From 3a84aef64a1992c75b5638c9475397b6cf24a186 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Wed, 5 Nov 2025 10:24:16 -0500
Subject: [PATCH 01/61] [PowerPC][NFC] auto gen checks vec rounding tests
 (#166435)

Update tests to contain auto generated checks.
---
 llvm/test/CodeGen/PowerPC/vec_rounding.ll | 195 +++++++++++++++-------
 1 file changed, 137 insertions(+), 58 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/vec_rounding.ll b/llvm/test/CodeGen/PowerPC/vec_rounding.ll
index 2f16a435440ff..438c8ebdc099e 100644
--- a/llvm/test/CodeGen/PowerPC/vec_rounding.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_rounding.ll
@@ -1,172 +1,251 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
 
 ; Check vector round to single-precision toward -infinity (vrfim)
 ; instruction generation using Altivec.
 
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
 declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
 define <2 x double> @floor_v2f64(<2 x double> %p)
+; CHECK-LABEL: floor_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    frim 1, 1
+; CHECK-NEXT:    frim 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
-; CHECK-LABEL: floor_v2f64:
-; CHECK: frim
-; CHECK: frim
 
 declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
 define <4 x double> @floor_v4f64(<4 x double> %p)
+; CHECK-LABEL: floor_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    frim 1, 1
+; CHECK-NEXT:    frim 2, 2
+; CHECK-NEXT:    frim 3, 3
+; CHECK-NEXT:    frim 4, 4
+; CHECK-NEXT:    blr
 {
   %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }
-; CHECK-LABEL: floor_v4f64:
-; CHECK: frim
-; CHECK: frim
-; CHECK: frim
-; CHECK: frim
 
 declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
 define <2 x double> @ceil_v2f64(<2 x double> %p)
+; CHECK-LABEL: ceil_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    frip 1, 1
+; CHECK-NEXT:    frip 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
-; CHECK-LABEL: ceil_v2f64:
-; CHECK: frip
-; CHECK: frip
 
 declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
 define <4 x double> @ceil_v4f64(<4 x double> %p)
+; CHECK-LABEL: ceil_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    frip 1, 1
+; CHECK-NEXT:    frip 2, 2
+; CHECK-NEXT:    frip 3, 3
+; CHECK-NEXT:    frip 4, 4
+; CHECK-NEXT:    blr
 {
   %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }
-; CHECK-LABEL: ceil_v4f64:
-; CHECK: frip
-; CHECK: frip
-; CHECK: frip
-; CHECK: frip
 
 declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
 define <2 x double> @trunc_v2f64(<2 x double> %p)
+; CHECK-LABEL: trunc_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    friz 1, 1
+; CHECK-NEXT:    friz 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
-; CHECK-LABEL: trunc_v2f64:
-; CHECK: friz
-; CHECK: friz
 
 declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
 define <4 x double> @trunc_v4f64(<4 x double> %p)
+; CHECK-LABEL: trunc_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    friz 1, 1
+; CHECK-NEXT:    friz 2, 2
+; CHECK-NEXT:    friz 3, 3
+; CHECK-NEXT:    friz 4, 4
+; CHECK-NEXT:    blr
 {
   %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }
-; CHECK-LABEL: trunc_v4f64:
-; CHECK: friz
-; CHECK: friz
-; CHECK: friz
-; CHECK: friz
 
 declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
-define <2 x double> @nearbyint_v2f64(<2 x double> %p)
+define <2 x double> @nearbyint_v2f64(<2 x double> %p) nounwind
+; CHECK-LABEL: nearbyint_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    stdu 1, -128(1)
+; CHECK-NEXT:    std 0, 144(1)
+; CHECK-NEXT:    stfd 30, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 31, 120(1) # 8-byte Folded Spill
+; CHECK-NEXT:    fmr 31, 2
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 30, 1
+; CHECK-NEXT:    fmr 1, 31
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 2, 1
+; CHECK-NEXT:    fmr 1, 30
+; CHECK-NEXT:    lfd 31, 120(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 30, 112(1) # 8-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 128
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
 {
   %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
-; CHECK-LABEL: nearbyint_v2f64:
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
 
 declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
-define <4 x double> @nearbyint_v4f64(<4 x double> %p)
+define <4 x double> @nearbyint_v4f64(<4 x double> %p) nounwind
+; CHECK-LABEL: nearbyint_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    stdu 1, -144(1)
+; CHECK-NEXT:    std 0, 160(1)
+; CHECK-NEXT:    stfd 28, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 29, 120(1) # 8-byte Folded Spill
+; CHECK-NEXT:    fmr 29, 2
+; CHECK-NEXT:    stfd 30, 128(1) # 8-byte Folded Spill
+; CHECK-NEXT:    fmr 30, 3
+; CHECK-NEXT:    stfd 31, 136(1) # 8-byte Folded Spill
+; CHECK-NEXT:    fmr 31, 4
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 28, 1
+; CHECK-NEXT:    fmr 1, 29
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 29, 1
+; CHECK-NEXT:    fmr 1, 30
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 30, 1
+; CHECK-NEXT:    fmr 1, 31
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 4, 1
+; CHECK-NEXT:    fmr 1, 28
+; CHECK-NEXT:    lfd 31, 136(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 28, 112(1) # 8-byte Folded Reload
+; CHECK-NEXT:    fmr 2, 29
+; CHECK-NEXT:    fmr 3, 30
+; CHECK-NEXT:    lfd 30, 128(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 29, 120(1) # 8-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 144
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
 {
   %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }
-; CHECK-LABEL: nearbyint_v4f64:
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
 
 
 declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
 define <4 x float> @floor_v4f32(<4 x float> %p)
+; CHECK-LABEL: floor_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfim 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
   ret <4 x float> %t
 }
-; CHECK-LABEL: floor_v4f32:
-; CHECK: vrfim
 
 declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
 define <8 x float> @floor_v8f32(<8 x float> %p)
+; CHECK-LABEL: floor_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfim 2, 2
+; CHECK-NEXT:    vrfim 3, 3
+; CHECK-NEXT:    blr
 {
   %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
-; CHECK-LABEL: floor_v8f32:
-; CHECK: vrfim
-; CHECK: vrfim
 
 declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
 define <4 x float> @ceil_v4f32(<4 x float> %p)
+; CHECK-LABEL: ceil_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfip 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
   ret <4 x float> %t
 }
-; CHECK-LABEL: ceil_v4f32:
-; CHECK: vrfip
 
 declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
 define <8 x float> @ceil_v8f32(<8 x float> %p)
+; CHECK-LABEL: ceil_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfip 2, 2
+; CHECK-NEXT:    vrfip 3, 3
+; CHECK-NEXT:    blr
 {
   %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
-; CHECK-LABEL: ceil_v8f32:
-; CHECK: vrfip
-; CHECK: vrfip
 
 declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
 define <4 x float> @trunc_v4f32(<4 x float> %p)
+; CHECK-LABEL: trunc_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfiz 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
   ret <4 x float> %t
 }
-; CHECK-LABEL: trunc_v4f32:
-; CHECK: vrfiz
 
 declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
 define <8 x float> @trunc_v8f32(<8 x float> %p)
+; CHECK-LABEL: trunc_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfiz 2, 2
+; CHECK-NEXT:    vrfiz 3, 3
+; CHECK-NEXT:    blr
 {
   %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
-; CHECK-LABEL: trunc_v8f32:
-; CHECK: vrfiz
-; CHECK: vrfiz
 
 declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
 define <4 x float> @nearbyint_v4f32(<4 x float> %p)
+; CHECK-LABEL: nearbyint_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfin 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
   ret <4 x float> %t
 }
-; CHECK-LABEL: nearbyint_v4f32:
-; CHECK: vrfin
 
 declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
 define <8 x float> @nearbyint_v8f32(<8 x float> %p)
+; CHECK-LABEL: nearbyint_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfin 2, 2
+; CHECK-NEXT:    vrfin 3, 3
+; CHECK-NEXT:    blr
 {
   %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
-; CHECK-LABEL: nearbyint_v8f32:
-; CHECK: vrfin
-; CHECK: vrfin

From 9762ab0c3d25b311826f57f0c5b4d601dcede8bc Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr@nvidia.com>
Date: Wed, 5 Nov 2025 20:57:26 +0530
Subject: [PATCH 02/61] [MLIR][NVVM] Fix the lowering of mbarrier.test.wait
 (#166555)

PR #165993 accidentally broke the lowering of the `test.wait` Op.

This patch fixes the issue and adds tests to verify the lowering to
intrinsics for all mbarrier Ops, ensuring similar regressions are caught in the
future.

Additionally, the `cp-async-mbarrier` test is moved to the
`mbarriers.mlir` test file to keep all related tests together.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td |   2 +-
 mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir | 116 ++++++++++++++++++++
 mlir/test/Target/LLVMIR/nvvmir.mlir         |  13 ---
 3 files changed, 117 insertions(+), 14 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 10f0cc254ea97..80bc0e5986e51 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -949,7 +949,7 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
   }];
 
   string llvmBuilder = [{
-    auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
+    auto [id, args] = NVVM::MBarrierTestWaitOp::getIntrinsicIDAndArgs(
                       *op, moduleTranslation, builder);
     $res = createIntrinsicCall(builder, id, args);
   }];
diff --git a/mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir b/mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir
new file mode 100644
index 0000000000000..9bb3b082777fd
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir
@@ -0,0 +1,116 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) {
+  // CHECK-LABEL: define void @cp_async_mbarrier_arrive(ptr addrspace(3) %0, ptr %1) {
+  // CHECK-NEXT: call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %1)
+  // CHECK-NEXT: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %1)
+  // CHECK-NEXT: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %0)
+  // CHECK-NEXT: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr
+  nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr
+  nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3>
+  nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3>
+  llvm.return
+}
+
+llvm.func @mbarrier_init_generic(%barrier: !llvm.ptr) {
+  // CHECK-LABEL: define void @mbarrier_init_generic(ptr %0) {
+  // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.init(ptr %0, i32 %2)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  nvvm.mbarrier.init %barrier, %count : !llvm.ptr, i32
+  llvm.return
+}
+
+llvm.func @mbarrier_init_shared(%barrier: !llvm.ptr<3>) {
+  // CHECK-LABEL: define void @mbarrier_init_shared(ptr addrspace(3) %0) {
+  // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.init.shared(ptr addrspace(3) %0, i32 %2)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  nvvm.mbarrier.init %barrier, %count : !llvm.ptr<3>, i32
+  llvm.return
+}
+
+llvm.func @mbarrier_inval_generic(%barrier: !llvm.ptr) {
+  // CHECK-LABEL: define void @mbarrier_inval_generic(ptr %0) {
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.inval(ptr %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.mbarrier.inval %barrier : !llvm.ptr
+  llvm.return
+}
+
+llvm.func @mbarrier_inval_shared(%barrier: !llvm.ptr<3>) {
+  // CHECK-LABEL: define void @mbarrier_inval_shared(ptr addrspace(3) %0) {
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.inval.shared(ptr addrspace(3) %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.mbarrier.inval %barrier : !llvm.ptr<3>
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive(%barrier: !llvm.ptr) {
+  // CHECK-LABEL: define void @mbarrier_arrive(ptr %0) {
+  // CHECK-NEXT: %2 = call i64 @llvm.nvvm.mbarrier.arrive(ptr %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr  -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_shared(%barrier: !llvm.ptr<3>) {
+  // CHECK-LABEL: define void @mbarrier_arrive_shared(ptr addrspace(3) %0) {
+  // CHECK-NEXT: %2 = call i64 @llvm.nvvm.mbarrier.arrive.shared(ptr addrspace(3) %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_nocomplete(%barrier: !llvm.ptr) {
+  // CHECK-LABEL: define void @mbarrier_arrive_nocomplete(ptr %0) {
+  // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.noComplete(ptr %0, i32 %2)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr, i32 -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_nocomplete_shared(%barrier: !llvm.ptr<3>) {
+  // CHECK-LABEL: define void @mbarrier_arrive_nocomplete_shared(ptr addrspace(3) %0) {
+  // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(ptr addrspace(3) %0, i32 %2)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32  -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_test_wait(%barrier: !llvm.ptr, %token : i64) -> i1 {
+  // CHECK-LABEL: define i1 @mbarrier_test_wait(ptr %0, i64 %1) {
+  // CHECK-NEXT: %3 = call i1 @llvm.nvvm.mbarrier.test.wait(ptr %0, i64 %1)
+  // CHECK-NEXT: ret i1 %3
+  // CHECK-NEXT: }
+  %isComplete = nvvm.mbarrier.test.wait %barrier, %token : !llvm.ptr, i64 -> i1
+  llvm.return %isComplete : i1
+}
+
+llvm.func @mbarrier_test_wait_shared(%barrier: !llvm.ptr<3>, %token : i64) {
+  // CHECK-LABEL: define void @mbarrier_test_wait_shared(ptr addrspace(3) %0, i64 %1) {
+  // CHECK-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: %4 = call i1 @llvm.nvvm.mbarrier.test.wait.shared(ptr addrspace(3) %0, i64 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  %isComplete = nvvm.mbarrier.test.wait %barrier, %token : !llvm.ptr<3>, i64 -> i1
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 3fc09f371a347..1ec55408e97a5 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -531,19 +531,6 @@ llvm.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32)
   llvm.return
 }
 
-// CHECK-LABEL: @cp_async_mbarrier_arrive
-llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) {
-  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %{{.*}})
-  nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr
-  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %{{.*}})
-  nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr
-  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %{{.*}})
-  nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3>
-  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %{{.*}})
-  nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3>
-  llvm.return
-}
-
 // CHECK-LABEL: @llvm_nvvm_setmaxregister
 llvm.func @llvm_nvvm_setmaxregister() {
   // CHECK: call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 256)

From 338fb02c9878761a97ed0e7e44c19d6ed8463434 Mon Sep 17 00:00:00 2001
From: Elvina Yakubova <eyakubova@nvidia.com>
Date: Wed, 5 Nov 2025 15:28:31 +0000
Subject: [PATCH 03/61] =?UTF-8?q?[BOLT][NFC]=20Rename=20funtions=20with=20?=
 =?UTF-8?q?=5Fnegative=20suffix=20to=20=5Funknown=20when=20th=E2=80=A6=20(?=
 =?UTF-8?q?#166536)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…e size is unknown

Keep _negative suffix only for test cases when the size is negative
---
 bolt/test/runtime/AArch64/inline-memcpy.s | 30 +++++++++++------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index badff299603a0..75066c855b9ed 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -81,14 +81,14 @@
 # CHECK-ASM: bl{{.*}}<memcpy
 
 # Register move should NOT be inlined (size unknown at compile time)
-# CHECK-ASM-LABEL: <test_register_move_negative>:
+# CHECK-ASM-LABEL: <test_register_move_unknown>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
-# CHECK-ASM-LABEL: <test_x2_rewrite_negative>:
+# CHECK-ASM-LABEL: <test_x2_rewrite_unknown>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
 # Live-in parameter should NOT be inlined (size unknown at compile time)
-# CHECK-ASM-LABEL: <test_live_in_negative>:
+# CHECK-ASM-LABEL: <test_live_in_unknown>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
 # _memcpy8 should be inlined with end-pointer return (dest+size)
@@ -262,9 +262,9 @@ test_4_byte_add_immediate:
 	ret
 	.size	test_4_byte_add_immediate, .-test_4_byte_add_immediate
 
-	.globl	test_register_move_negative
-	.type	test_register_move_negative,@function
-test_register_move_negative:
+	.globl	test_register_move_unknown
+	.type	test_register_move_unknown,@function
+test_register_move_unknown:
 	stp	x29, x30, [sp, #-32]!
 	mov	x29, sp
 	add	x1, sp, #16
@@ -274,20 +274,20 @@ test_register_move_negative:
 	bl	memcpy
 	ldp	x29, x30, [sp], #32
 	ret
-	.size	test_register_move_negative, .-test_register_move_negative
+	.size	test_register_move_unknown, .-test_register_move_unknown
 
-	.globl  test_x2_rewrite_negative
-	.type   test_x2_rewrite_negative,@function
-test_x2_rewrite_negative:
+	.globl  test_x2_rewrite_unknown
+	.type   test_x2_rewrite_unknown,@function
+test_x2_rewrite_unknown:
 	mov     x2, #8
 	ldr     x2, [sp, #24]
 	bl      memcpy
 	ret
-	.size   test_x2_rewrite_negative, .-test_x2_rewrite_negative
+	.size   test_x2_rewrite_unknown, .-test_x2_rewrite_unknown
 
-	.globl	test_live_in_negative
-	.type	test_live_in_negative,@function
-test_live_in_negative:
+	.globl	test_live_in_unknown
+	.type	test_live_in_unknown,@function
+test_live_in_unknown:
 	# x2 comes in as parameter, no instruction sets it (should NOT inline)
 	stp	x29, x30, [sp, #-32]!
 	mov	x29, sp
@@ -297,7 +297,7 @@ test_live_in_negative:
 	bl	memcpy
 	ldp	x29, x30, [sp], #32
 	ret
-	.size	test_live_in_negative, .-test_live_in_negative
+	.size	test_live_in_unknown, .-test_live_in_unknown
 
 	.globl	test_memcpy8_4_byte
 	.type	test_memcpy8_4_byte,@function

From 87fb7b002db39af9fbf9351cbd404cf14396bb93 Mon Sep 17 00:00:00 2001
From: Jean-Didier PAILLEUX <jean-didier.pailleux@sipearl.com>
Date: Wed, 5 Nov 2025 16:35:25 +0100
Subject: [PATCH 04/61] [flang] Adding NOTIFY specifier in image selector and
 add notify type checks (#148810)

This PR adds support for the NOTIFY specifier in the image selector as
described in the 2023 standard, and add checks for the NOTIFY_TYPE type.
---
 flang/examples/FeatureList/FeatureList.cpp   |  1 +
 flang/include/flang/Evaluate/traverse.h      |  2 +-
 flang/include/flang/Evaluate/variable.h      |  4 +++
 flang/include/flang/Parser/dump-parse-tree.h |  1 +
 flang/include/flang/Parser/parse-tree.h      |  4 ++-
 flang/include/flang/Semantics/tools.h        |  3 ++
 flang/lib/Evaluate/variable.cpp              | 13 +++++++
 flang/lib/Lower/Support/Utils.cpp            |  5 +--
 flang/lib/Parser/Fortran-parsers.cpp         |  7 ++--
 flang/lib/Parser/unparse.cpp                 |  1 +
 flang/lib/Semantics/check-declarations.cpp   | 13 +++++++
 flang/lib/Semantics/dump-expr.cpp            |  1 +
 flang/lib/Semantics/expression.cpp           | 13 +++++++
 flang/lib/Semantics/tools.cpp                | 38 ++++++++++++++++++++
 flang/test/Semantics/coarrays02.f90          | 17 +++++++++
 flang/test/Semantics/notifywait03.f90        |  1 +
 16 files changed, 118 insertions(+), 6 deletions(-)

diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp
index ef58da61e371b..bb55a8163d938 100644
--- a/flang/examples/FeatureList/FeatureList.cpp
+++ b/flang/examples/FeatureList/FeatureList.cpp
@@ -348,6 +348,7 @@ struct NodeVisitor {
   READ_FEATURE(TeamValue)
   READ_FEATURE(ImageSelector)
   READ_FEATURE(ImageSelectorSpec)
+  READ_FEATURE(ImageSelectorSpec::Notify)
   READ_FEATURE(ImageSelectorSpec::Stat)
   READ_FEATURE(ImageSelectorSpec::Team_Number)
   READ_FEATURE(ImplicitPart)
diff --git a/flang/include/flang/Evaluate/traverse.h b/flang/include/flang/Evaluate/traverse.h
index 48aafa8982559..d63c16f93230a 100644
--- a/flang/include/flang/Evaluate/traverse.h
+++ b/flang/include/flang/Evaluate/traverse.h
@@ -146,7 +146,7 @@ class Traverse {
     return Combine(x.base(), x.subscript());
   }
   Result operator()(const CoarrayRef &x) const {
-    return Combine(x.base(), x.cosubscript(), x.stat(), x.team());
+    return Combine(x.base(), x.cosubscript(), x.notify(), x.stat(), x.team());
   }
   Result operator()(const DataRef &x) const { return visitor_(x.u); }
   Result operator()(const Substring &x) const {
diff --git a/flang/include/flang/Evaluate/variable.h b/flang/include/flang/Evaluate/variable.h
index 5c14421fd3a1b..4f64ede3d407d 100644
--- a/flang/include/flang/Evaluate/variable.h
+++ b/flang/include/flang/Evaluate/variable.h
@@ -260,6 +260,9 @@ class CoarrayRef {
   // it's TEAM=.
   std::optional<Expr<SomeType>> team() const;
   CoarrayRef &set_team(Expr<SomeType> &&);
+  // When notify() is Expr<Some>, it's NOTIFY=.
+  std::optional<Expr<SomeType>> notify() const;
+  CoarrayRef &set_notify(Expr<SomeType> &&);
 
   int Rank() const;
   int Corank() const { return 0; }
@@ -272,6 +275,7 @@ class CoarrayRef {
 private:
   common::CopyableIndirection<DataRef> base_;
   std::vector<Expr<SubscriptInteger>> cosubscript_;
+  std::optional<common::CopyableIndirection<Expr<SomeType>>> notify_;
   std::optional<common::CopyableIndirection<Expr<SomeInteger>>> stat_;
   std::optional<common::CopyableIndirection<Expr<SomeType>>> team_;
 };
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index de2716410d6cd..b2424023b0168 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -387,6 +387,7 @@ class ParseTreeDumper {
   NODE(parser, TeamValue)
   NODE(parser, ImageSelector)
   NODE(parser, ImageSelectorSpec)
+  NODE(ImageSelectorSpec, Notify)
   NODE(ImageSelectorSpec, Stat)
   NODE(ImageSelectorSpec, Team_Number)
   NODE(parser, ImplicitPart)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 8c7578f7a1941..32e444fbb2e6c 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -1684,13 +1684,15 @@ using Cosubscript = ScalarIntExpr;
 WRAPPER_CLASS(TeamValue, Scalar<common::Indirection<Expr>>);
 
 // R926 image-selector-spec ->
+//        NOTIFY = notify-variable |
 //        STAT = stat-variable | TEAM = team-value |
 //        TEAM_NUMBER = scalar-int-expr
 struct ImageSelectorSpec {
   WRAPPER_CLASS(Stat, Scalar<Integer<common::Indirection<Variable>>>);
   WRAPPER_CLASS(Team_Number, ScalarIntExpr);
+  WRAPPER_CLASS(Notify, Scalar<common::Indirection<Variable>>);
   UNION_CLASS_BOILERPLATE(ImageSelectorSpec);
-  std::variant<Stat, TeamValue, Team_Number> u;
+  std::variant<Notify, Stat, TeamValue, Team_Number> u;
 };
 
 // R924 image-selector ->
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 8a7b9867c0979..1c3477013b559 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -107,6 +107,7 @@ bool IsBindCProcedure(const Scope &);
 // Returns a pointer to the function's symbol when true, else null
 const Symbol *IsFunctionResultWithSameNameAsFunction(const Symbol &);
 bool IsOrContainsEventOrLockComponent(const Symbol &);
+bool IsOrContainsNotifyComponent(const Symbol &);
 bool CanBeTypeBoundProc(const Symbol &);
 // Does a non-PARAMETER symbol have explicit initialization with =value or
 // =>target in its declaration (but not in a DATA statement)? (Being
@@ -652,6 +653,8 @@ using PotentialAndPointerComponentIterator =
 // dereferenced.
 PotentialComponentIterator::const_iterator FindEventOrLockPotentialComponent(
     const DerivedTypeSpec &, bool ignoreCoarrays = false);
+PotentialComponentIterator::const_iterator FindNotifyPotentialComponent(
+    const DerivedTypeSpec &, bool ignoreCoarrays = false);
 PotentialComponentIterator::const_iterator FindCoarrayPotentialComponent(
     const DerivedTypeSpec &);
 PotentialAndPointerComponentIterator::const_iterator
diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp
index b9b34d4d5bc89..b257dad42fc58 100644
--- a/flang/lib/Evaluate/variable.cpp
+++ b/flang/lib/Evaluate/variable.cpp
@@ -89,6 +89,14 @@ std::optional<Expr<SomeType>> CoarrayRef::team() const {
   }
 }
 
+std::optional<Expr<SomeType>> CoarrayRef::notify() const {
+  if (notify_) {
+    return notify_.value().value();
+  } else {
+    return std::nullopt;
+  }
+}
+
 CoarrayRef &CoarrayRef::set_stat(Expr<SomeInteger> &&v) {
   CHECK(IsVariable(v));
   stat_.emplace(std::move(v));
@@ -100,6 +108,11 @@ CoarrayRef &CoarrayRef::set_team(Expr<SomeType> &&v) {
   return *this;
 }
 
+CoarrayRef &CoarrayRef::set_notify(Expr<SomeType> &&v) {
+  notify_.emplace(std::move(v));
+  return *this;
+}
+
 const Symbol &CoarrayRef::GetFirstSymbol() const {
   return base().GetFirstSymbol();
 }
diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp
index 1b4d37e9798a9..4b95a3adf052a 100644
--- a/flang/lib/Lower/Support/Utils.cpp
+++ b/flang/lib/Lower/Support/Utils.cpp
@@ -82,7 +82,7 @@ class HashEvaluateExpr {
          x.cosubscript())
       cosubs -= getHashValue(v);
     return getHashValue(x.base()) * 97u - cosubs + getHashValue(x.stat()) +
-           257u + getHashValue(x.team());
+           257u + getHashValue(x.team()) + getHashValue(x.notify());
   }
   static unsigned getHashValue(const Fortran::evaluate::NamedEntity &x) {
     if (x.IsSymbol())
@@ -341,7 +341,8 @@ class IsEqualEvaluateExpr {
                       const Fortran::evaluate::CoarrayRef &y) {
     return isEqual(x.base(), y.base()) &&
            isEqual(x.cosubscript(), y.cosubscript()) &&
-           isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team());
+           isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team()) &&
+           isEqual(x.notify(), y.notify());
   }
   static bool isEqual(const Fortran::evaluate::NamedEntity &x,
                       const Fortran::evaluate::NamedEntity &y) {
diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp
index 59fe7d813d96a..ea6a1eada2741 100644
--- a/flang/lib/Parser/Fortran-parsers.cpp
+++ b/flang/lib/Parser/Fortran-parsers.cpp
@@ -1212,12 +1212,15 @@ TYPE_CONTEXT_PARSER("image selector"_en_US,
 
 // R926 image-selector-spec ->
 //        STAT = stat-variable | TEAM = team-value |
-//        TEAM_NUMBER = scalar-int-expr
+//        TEAM_NUMBER = scalar-int-expr |
+//        NOTIFY = notify-variable
 TYPE_PARSER(construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Stat>(
                 "STAT =" >> scalar(integer(indirect(variable))))) ||
     construct<ImageSelectorSpec>(construct<TeamValue>("TEAM =" >> teamValue)) ||
     construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Team_Number>(
-        "TEAM_NUMBER =" >> scalarIntExpr)))
+        "TEAM_NUMBER =" >> scalarIntExpr)) ||
+    construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Notify>(
+        "NOTIFY =" >> scalar(indirect(variable)))))
 
 // R927 allocate-stmt ->
 //        ALLOCATE ( [type-spec ::] allocation-list [, alloc-opt-list] )
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 84123030195e9..6bb14a43e7b99 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -819,6 +819,7 @@ class UnparseVisitor {
       Word("TEAM=");
     }
   }
+  void Before(const ImageSelectorSpec::Notify &) { Word("NOTIFY="); }
   void Unparse(const AllocateStmt &x) { // R927
     Word("ALLOCATE(");
     Walk(std::get<std::optional<TypeSpec>>(x.t), "::");
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index de407d3b1e125..9a6b3ff3cdc2c 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -855,6 +855,15 @@ void CheckHelper::CheckObjectEntity(
         messages_.Say(
             "Variable '%s' with EVENT_TYPE or LOCK_TYPE potential component '%s' must be a coarray"_err_en_US,
             symbol.name(), component.BuildResultDesignatorName());
+      } else if (IsNotifyType(derived)) { // C1612
+        messages_.Say(
+            "Variable '%s' with NOTIFY_TYPE must be a coarray"_err_en_US,
+            symbol.name());
+      } else if (auto component{FindNotifyPotentialComponent( // C1611
+                     *derived, /*ignoreCoarrays=*/true)}) {
+        messages_.Say(
+            "Variable '%s' with NOTIFY_TYPE potential component '%s' must be a coarray"_err_en_US,
+            symbol.name(), component.BuildResultDesignatorName());
       }
     }
   }
@@ -873,6 +882,10 @@ void CheckHelper::CheckObjectEntity(
         messages_.Say(
             "An INTENT(OUT) dummy argument may not be, or contain, EVENT_TYPE or LOCK_TYPE"_err_en_US);
       }
+      if (IsOrContainsNotifyComponent(symbol)) { // C1613
+        messages_.Say(
+            "An INTENT(OUT) dummy argument may not be, or contain, NOTIFY_TYPE"_err_en_US);
+      }
       if (IsAssumedSizeArray(symbol)) { // C834
         if (type && type->IsPolymorphic()) {
           messages_.Say(
diff --git a/flang/lib/Semantics/dump-expr.cpp b/flang/lib/Semantics/dump-expr.cpp
index 66cedab94bfb4..8d354cf65b61e 100644
--- a/flang/lib/Semantics/dump-expr.cpp
+++ b/flang/lib/Semantics/dump-expr.cpp
@@ -23,6 +23,7 @@ void DumpEvaluateExpr::Show(const evaluate::CoarrayRef &x) {
   Indent("coarray ref");
   Show(x.base());
   Show(x.cosubscript());
+  Show(x.notify());
   Show(x.stat());
   Show(x.team());
   Outdent();
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index c8167fd34f666..ac58dfc005f17 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -1579,6 +1579,19 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::CoindexedNamedObject &x) {
         std::get<std::list<parser::ImageSelectorSpec>>(x.imageSelector.t)) {
       common::visit(
           common::visitors{
+              [&](const parser::ImageSelectorSpec::Notify &x) {
+                Analyze(x.v);
+                if (const auto *expr{GetExpr(context_, x.v)}) {
+                  if (coarrayRef.notify()) {
+                    Say("coindexed reference has multiple NOTIFY= specifiers"_err_en_US);
+                  } else if (auto dyType{expr->GetType()};
+                      dyType && IsNotifyType(GetDerivedTypeSpec(*dyType))) {
+                    coarrayRef.set_notify(Expr<SomeType>{*expr});
+                  } else {
+                    Say("NOTIFY= specifier must have type NOTIFY_TYPE from ISO_FORTRAN_ENV"_err_en_US);
+                  }
+                }
+              },
               [&](const parser::ImageSelectorSpec::Stat &x) {
                 Analyze(x.v);
                 if (const auto *expr{GetExpr(context_, x.v)}) {
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 8eddd03faa962..cf1e5e7d44565 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -582,6 +582,18 @@ bool IsOrContainsEventOrLockComponent(const Symbol &original) {
   return false;
 }
 
+bool IsOrContainsNotifyComponent(const Symbol &original) {
+  const Symbol &symbol{ResolveAssociations(original, /*stopAtTypeGuard=*/true)};
+  if (evaluate::IsVariable(symbol)) {
+    if (const DeclTypeSpec *type{symbol.GetType()}) {
+      if (const DerivedTypeSpec *derived{type->AsDerived()}) {
+        return IsNotifyType(derived) || FindNotifyPotentialComponent(*derived);
+      }
+    }
+  }
+  return false;
+}
+
 // Check this symbol suitable as a type-bound procedure - C769
 bool CanBeTypeBoundProc(const Symbol &symbol) {
   if (IsDummy(symbol) || IsProcedurePointer(symbol)) {
@@ -1489,6 +1501,32 @@ PotentialComponentIterator::const_iterator FindEventOrLockPotentialComponent(
   return iter;
 }
 
+PotentialComponentIterator::const_iterator FindNotifyPotentialComponent(
+    const DerivedTypeSpec &derived, bool ignoreCoarrays) {
+  PotentialComponentIterator potentials{derived};
+  auto iter{potentials.begin()};
+  for (auto end{potentials.end()}; iter != end; ++iter) {
+    const Symbol &component{*iter};
+    if (const auto *object{component.detailsIf<ObjectEntityDetails>()}) {
+      if (const DeclTypeSpec *type{object->type()}) {
+        if (IsNotifyType(type->AsDerived())) {
+          if (!ignoreCoarrays) {
+            break; // found one
+          }
+          auto path{iter.GetComponentPath()};
+          path.pop_back();
+          if (std::find_if(path.begin(), path.end(), [](const Symbol &sym) {
+                return evaluate::IsCoarray(sym);
+              }) == path.end()) {
+            break; // found one not in a coarray
+          }
+        }
+      }
+    }
+  }
+  return iter;
+}
+
 UltimateComponentIterator::const_iterator FindAllocatableUltimateComponent(
     const DerivedTypeSpec &derived) {
   UltimateComponentIterator ultimates{derived};
diff --git a/flang/test/Semantics/coarrays02.f90 b/flang/test/Semantics/coarrays02.f90
index b16e0ccb58797..e866dd89c07ab 100644
--- a/flang/test/Semantics/coarrays02.f90
+++ b/flang/test/Semantics/coarrays02.f90
@@ -16,6 +16,8 @@ program main
   type(event_type) event
   !ERROR: Variable 'lock' with EVENT_TYPE or LOCK_TYPE must be a coarray
   type(lock_type) lock
+  !ERROR: Variable 'notify' with NOTIFY_TYPE must be a coarray
+  type(notify_type) notify
   integer :: local[*] ! ok in main
 end
 
@@ -120,3 +122,18 @@ subroutine s4
   !ERROR: Subscripts must appear in a coindexed reference when its base is an array
   print *, ta(1)%a[1]
 end
+
+subroutine s5(a, notify, res)
+  use iso_fortran_env
+  type t
+    type(notify_type) :: a
+  end type
+  real, intent(in) :: a[*]
+  type(event_type), intent(in) :: notify[*]
+  !ERROR: An INTENT(OUT) dummy argument may not be, or contain, NOTIFY_TYPE
+  type(notify_type), intent(out) :: res[*]
+  !ERROR: Variable 'bad' with NOTIFY_TYPE potential component '%a' must be a coarray
+  type(t) :: bad
+  !ERROR: NOTIFY= specifier must have type NOTIFY_TYPE from ISO_FORTRAN_ENV
+  print *, a[1, NOTIFY=notify]
+end
diff --git a/flang/test/Semantics/notifywait03.f90 b/flang/test/Semantics/notifywait03.f90
index 0fc56f66ad32d..a336a7a67669a 100644
--- a/flang/test/Semantics/notifywait03.f90
+++ b/flang/test/Semantics/notifywait03.f90
@@ -10,6 +10,7 @@ program test_notify_wait
   implicit none
 
   ! notify_type variables must be coarrays
+  !ERROR: Variable 'non_coarray' with NOTIFY_TYPE must be a coarray
   type(notify_type) :: non_coarray
 
   type(notify_type) :: notify_var[*], notify_array(2)[*]

From 52cb6e9d49f836b624bd0536734afd7aa4194ca0 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 5 Nov 2025 07:40:37 -0800
Subject: [PATCH 05/61] [ProfCheck][NFC] Make Function argument from branch
 weight setter optional (#166032)

This picks up from #166028, making the `Function` argument optional:
most cases don't need to provide it, but in e.g. InstCombine's case,
where the instruction (select, branch) is not attached to a function
yet, the function needs to be passed explicitly.

Co-authored-by: Florian Hahn <flo@fhahn.com>
---
 llvm/include/llvm/IR/ProfDataUtils.h                     | 9 +++++----
 llvm/lib/IR/IRBuilder.cpp                                | 3 +--
 llvm/lib/IR/ProfDataUtils.cpp                            | 9 ++++++---
 .../AggressiveInstCombine/AggressiveInstCombine.cpp      | 3 +--
 llvm/lib/Transforms/InstCombine/InstCombineInternal.h    | 2 +-
 llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp        | 3 +--
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp                | 3 +--
 7 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h
index a0876b169e0b8..a7bcbf010d1bf 100644
--- a/llvm/include/llvm/IR/ProfDataUtils.h
+++ b/llvm/include/llvm/IR/ProfDataUtils.h
@@ -194,10 +194,11 @@ LLVM_ABI void setExplicitlyUnknownBranchWeights(Instruction &I,
 /// Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch
 /// weights in the new instruction if the parent function of the original
 /// instruction has an entry count. This is to not confuse users by injecting
-/// profile data into non-profiled functions.
-LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I,
-                                                          Function &F,
-                                                          StringRef PassName);
+/// profile data into non-profiled functions. If \p F is nullptr, we will fetch
+/// the function from \p I.
+LLVM_ABI void
+setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName,
+                                            const Function *F = nullptr);
 
 /// Analogous to setExplicitlyUnknownBranchWeights, but for functions and their
 /// entry counts.
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 88dbd176e0d3f..95edb2e8e56d8 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1019,8 +1019,7 @@ Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True,
                                                      const Twine &Name) {
   Value *Ret = CreateSelectFMF(C, True, False, {}, Name);
   if (auto *SI = dyn_cast<SelectInst>(Ret)) {
-    setExplicitlyUnknownBranchWeightsIfProfiled(
-        *SI, *SI->getParent()->getParent(), PassName);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*SI, PassName);
   }
   return Ret;
 }
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index fc2be5188f456..94dbe1f3988b8 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -274,9 +274,12 @@ void llvm::setExplicitlyUnknownBranchWeights(Instruction &I,
 }
 
 void llvm::setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I,
-                                                       Function &F,
-                                                       StringRef PassName) {
-  if (std::optional<Function::ProfileCount> EC = F.getEntryCount();
+                                                       StringRef PassName,
+                                                       const Function *F) {
+  F = F ? F : I.getFunction();
+  assert(F && "Either pass a instruction attached to a Function, or explicitly "
+              "pass the Function that it will be attached to");
+  if (std::optional<Function::ProfileCount> EC = F->getEntryCount();
       EC && EC->getCount() > 0)
     setExplicitlyUnknownBranchWeights(I, PassName);
 }
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7a95df4b2a47c..b575d76e897d2 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1378,8 +1378,7 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
       IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
   // We can't know the precise weights here, as they would depend on the value
   // distribution of Call->getArgOperand(1). So we just mark it as "unknown".
-  setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(),
-                                              DEBUG_TYPE);
+  setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE);
   Type *IndexTy = DL.getIndexType(Call->getType());
   SmallVector<DominatorTree::UpdateType, 8> Updates;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index d85e4f7590197..9bdd8cb71f7f3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -479,7 +479,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
                                      const Twine &NameStr = "",
                                      InsertPosition InsertBefore = nullptr) {
     auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr);
-    setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, DEBUG_TYPE, &F);
     return Sel;
   }
 
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 0577ddbd2353c..7930b38174e49 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -330,8 +330,7 @@ static void buildPartialUnswitchConditionalBranch(
       HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof)
                        : nullptr);
   if (!HasBranchWeights)
-    setExplicitlyUnknownBranchWeightsIfProfiled(
-        *BR, *BR->getParent()->getParent(), DEBUG_TYPE);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
 }
 
 /// Copy a set of loop invariant values, and conditionally branch on them.
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 3a3e3ade20212..9a8dbebe5bfba 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5214,8 +5214,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
     // We don't have any info about this condition.
     auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB)
                              : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
-    setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(),
-                                                DEBUG_TYPE);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE);
 
     OldTI->eraseFromParent();
 

From 4334b43c6593f3839d33806131fd36c620390cbe Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Wed, 5 Nov 2025 10:43:56 -0500
Subject: [PATCH 06/61] [gn] port bb4ed55acdbc

---
 .../source/Plugins/LanguageRuntime/CPlusPlus/BUILD.gn  | 10 ++++++++--
 llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn    |  1 -
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/LanguageRuntime/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/LanguageRuntime/CPlusPlus/BUILD.gn
index 9848efef70568..fa99fa8649caf 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/LanguageRuntime/CPlusPlus/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/LanguageRuntime/CPlusPlus/BUILD.gn
@@ -1,10 +1,16 @@
 static_library("CPlusPlus") {
   output_name = "lldbPluginCPPRuntime"
-  configs += [ "//llvm/utils/gn/build:lldb_code" ]
+  configs += [
+    "//llvm/utils/gn/build:clang_code",
+    "//llvm/utils/gn/build:lldb_code",
+  ]
   deps = [
     "//lldb/source/Core",
     "//lldb/source/Symbol",
     "//lldb/source/Target",
   ]
-  sources = [ "CPPLanguageRuntime.cpp" ]
+  sources = [
+    "CPPLanguageRuntime.cpp",
+    "VerboseTrapFrameRecognizer.cpp",
+  ]
 }
diff --git a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
index a863baf912051..783eb96283596 100644
--- a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
@@ -105,6 +105,5 @@ static_library("Target") {
     "UnixSignals.cpp",
     "UnwindAssembly.cpp",
     "UnwindLLDB.cpp",
-    "VerboseTrapFrameRecognizer.cpp",
   ]
 }

From fa6cc7eadedd61347456f088ef40f6dfdcd751e2 Mon Sep 17 00:00:00 2001
From: Doug Wyatt <doug@sonosphere.com>
Date: Wed, 5 Nov 2025 07:45:33 -0800
Subject: [PATCH 07/61] [Clang] FunctionEffects: ignore (methods of) local
 CXXRecordDecls. (#166078)

In the following example, `Functor::method()` inappropriately triggers a
diagnostic that `outer()` is blocking by allocating memory.

```
void outer() [[clang::nonblocking]]
{
	struct Functor {
		int* ptr;

		void method() { ptr = new int; }
	};
}
```

---------

Co-authored-by: Doug Wyatt <dwyatt@apple.com>
---
 clang/lib/Sema/SemaFunctionEffects.cpp        |  8 ++++++++
 .../Sema/attr-nonblocking-constraints.cpp     | 19 +++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp
index 4b63eb7df1054..12cc02965e7d3 100644
--- a/clang/lib/Sema/SemaFunctionEffects.cpp
+++ b/clang/lib/Sema/SemaFunctionEffects.cpp
@@ -1302,6 +1302,14 @@ class Analyzer {
       return true;
     }
 
+    bool TraverseCXXRecordDecl(CXXRecordDecl *D) override {
+      // Completely skip local struct/class/union declarations since their
+      // methods would otherwise be incorrectly interpreted as part of the
+      // function we are currently traversing. The initial Sema pass will have
+      // already recorded any nonblocking methods needing analysis.
+      return true;
+    }
+
     bool TraverseConstructorInitializer(CXXCtorInitializer *Init) override {
       ViolationSite PrevVS = VSite;
       if (Init->isAnyMemberInitializer())
diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp
index 881e816292d59..012c017798a1f 100644
--- a/clang/test/Sema/attr-nonblocking-constraints.cpp
+++ b/clang/test/Sema/attr-nonblocking-constraints.cpp
@@ -104,6 +104,25 @@ void nb8c()
 	};
 }
 
+void nb8d() [[clang::nonblocking]]
+{
+	// Blocking methods of a local CXXRecordDecl do not generate diagnostics
+	// for the outer function.
+	struct F1 {
+        void method() { void* ptr = new int; }
+	};
+
+	// Skipping the CXXRecordDecl does not skip a following VarDecl.
+	struct F2 {
+        F2() { void* ptr = new int; } // expected-note {{constructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}}
+	} f2; // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' constructor 'nb8d()::F2::F2'}}
+
+	// Nonblocking methods of a local CXXRecordDecl are verified independently.
+	struct F3 {
+		void method() [[clang::nonblocking]] { void* ptr = new int; }// expected-warning {{function with 'nonblocking' attribute must not allocate or deallocate memory}}
+	};
+}
+
 // Make sure template expansions are found and verified.
 	template <typename T>
 	struct Adder {

From d568601d5a0c7c315d8038f4dc24e0ccd97a1ba0 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Wed, 5 Nov 2025 07:46:12 -0800
Subject: [PATCH 08/61] [NFC][TableGen] Adopt NamespaceEmitter in
 DirectiveEmitter (#165600)

---
 llvm/test/TableGen/directive1.td              |  7 ++--
 llvm/test/TableGen/directive2.td              |  7 ++--
 .../utils/TableGen/Basic/DirectiveEmitter.cpp | 36 ++++++-------------
 3 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td
index 8648651f3d714..5bd7890e0ddd1 100644
--- a/llvm/test/TableGen/directive1.td
+++ b/llvm/test/TableGen/directive1.td
@@ -186,8 +186,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL:       #ifdef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-NEXT:  #undef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
-// IMPL-NEXT:  namespace llvm {
-// IMPL-NEXT:  namespace tdl {
+// IMPL-NEXT:  namespace llvm::tdl {
 // IMPL-EMPTY:
 // IMPL-NEXT:    // Sets for dira
 // IMPL-EMPTY:
@@ -204,8 +203,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL-EMPTY:
 // IMPL-NEXT:    static  requiredClauses_TDLD_dira {
 // IMPL-NEXT:    };
-// IMPL-NEXT:  } // namespace tdl
-// IMPL-NEXT:  } // namespace llvm
+// IMPL-EMPTY:
+// IMPL-NEXT:  } // namespace llvm::tdl
 // IMPL-EMPTY:
 // IMPL-NEXT:  #endif // GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td
index 96022d7647440..eaaf82ddaaf41 100644
--- a/llvm/test/TableGen/directive2.td
+++ b/llvm/test/TableGen/directive2.td
@@ -159,8 +159,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL:      #ifdef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-NEXT: #undef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
-// IMPL-NEXT: namespace llvm {
-// IMPL-NEXT: namespace tdl {
+// IMPL-NEXT: namespace llvm::tdl {
 // IMPL-EMPTY:
 // IMPL-NEXT:   // Sets for dira
 // IMPL-EMPTY:
@@ -177,8 +176,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL-EMPTY:
 // IMPL-NEXT:   static  requiredClauses_TDLD_dira {
 // IMPL-NEXT:   };
-// IMPL-NEXT: } // namespace tdl
-// IMPL-NEXT: } // namespace llvm
+// IMPL-EMPTY:
+// IMPL-NEXT: } // namespace llvm::tdl
 // IMPL-EMPTY:
 // IMPL-NEXT: #endif // GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
index d33bf45595e2e..0bb743dc8a7f5 100644
--- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
@@ -359,7 +359,6 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) {
     OS << "  static constexpr bool is_iterable = true;\n";
     OS << "};\n";
   }
-  LlvmNS.close();
 }
 
 // Given a list of spellings (for a given clause/directive), order them
@@ -931,27 +930,20 @@ static void generateClauseSet(ArrayRef<const Record *> VerClauses,
 // Generate an enum set for the 4 kinds of clauses linked to a directive.
 static void generateDirectiveClauseSets(const DirectiveLanguage &DirLang,
                                         Frontend FE, raw_ostream &OS) {
+  IfDefEmitter Scope(OS, "GEN_" + getFESpelling(FE).upper() +
+                             "_DIRECTIVE_CLAUSE_SETS");
 
-  std::string IfDefName{"GEN_"};
-  IfDefName += getFESpelling(FE).upper();
-  IfDefName += "_DIRECTIVE_CLAUSE_SETS";
-  IfDefEmitter Scope(OS, IfDefName);
-
-  StringRef Namespace =
-      getFESpelling(FE == Frontend::Flang ? Frontend::LLVM : FE);
+  std::string Namespace =
+      getFESpelling(FE == Frontend::Flang ? Frontend::LLVM : FE).str();
   // The namespace has to be different for clang vs flang, as 2 structs with the
   // same name but different layout is UB.  So just put the 'clang' on in the
   // clang namespace.
-  OS << "namespace " << Namespace << " {\n";
-
-  // Open namespaces defined in the directive language.
-  SmallVector<StringRef, 2> Namespaces;
-  SplitString(DirLang.getCppNamespace(), Namespaces, "::");
-  for (auto Ns : Namespaces)
-    OS << "namespace " << Ns << " {\n";
+  // Additionally, open namespaces defined in the directive language.
+  if (!DirLang.getCppNamespace().empty())
+    Namespace += "::" + DirLang.getCppNamespace().str();
+  NamespaceEmitter NS(OS, Namespace);
 
   for (const Directive Dir : DirLang.getDirectives()) {
-    OS << "\n";
     OS << "// Sets for " << Dir.getSpellingForIdentifier() << "\n";
 
     generateClauseSet(Dir.getAllowedClauses(), OS, "allowedClauses_", Dir,
@@ -963,12 +955,6 @@ static void generateDirectiveClauseSets(const DirectiveLanguage &DirLang,
     generateClauseSet(Dir.getRequiredClauses(), OS, "requiredClauses_", Dir,
                       DirLang, FE);
   }
-
-  // Closing namespaces
-  for (auto Ns : reverse(Namespaces))
-    OS << "} // namespace " << Ns << "\n";
-
-  OS << "} // namespace " << Namespace << "\n";
 }
 
 // Generate a map of directive (key) with DirectiveClauses struct as values.
@@ -976,10 +962,8 @@ static void generateDirectiveClauseSets(const DirectiveLanguage &DirLang,
 // allowances (allowed, allowed once, allowed exclusive and required).
 static void generateDirectiveClauseMap(const DirectiveLanguage &DirLang,
                                        Frontend FE, raw_ostream &OS) {
-  std::string IfDefName{"GEN_"};
-  IfDefName += getFESpelling(FE).upper();
-  IfDefName += "_DIRECTIVE_CLAUSE_MAP";
-  IfDefEmitter Scope(OS, IfDefName);
+  IfDefEmitter Scope(OS, "GEN_" + getFESpelling(FE).upper() +
+                             "_DIRECTIVE_CLAUSE_MAP");
 
   OS << "{\n";
 

From 3641e269b0fdf3614b6a2a068678d8431204a489 Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Wed, 5 Nov 2025 16:48:24 +0100
Subject: [PATCH 09/61] [MsDemangle] Read entire chain of target names in
 special tables (#155630)

When there's a deep inheritance hierarchy of multiple C++ classes (see
below), then the mangled name of a VFTable can include multiple key
nodes in the target name.
For example, in the following code, MSVC will generate mangled names for
the VFTables that have up to three key classes in the context.
<details><summary>Code</summary>

```cpp
class Base1 {
  virtual void a() {};
};
class Base2 {
  virtual void b() {}
};

class Ind1 : public Base1 {};
class Ind2 : public Base1 {};

class A : public Ind1, public Ind2 {};

class Ind3 : public A {};
class Ind4 : public A {};

class B : public Ind3, public Ind4 {};

class Ind5 : public B {};
class Ind6 : public B {};

class C : public Ind5, public Ind6 {};

int main() { auto i = new C; }
```
</details>

This will include `??_7C@@6BInd1@@Ind4@@Ind5@@@` (and every other
combination). Microsoft's undname will demangle this to "const
C::\`vftable'{for \`Ind1's \`Ind4's \`Ind5'}". Previously, LLVM would
demangle this to "const C::\`vftable'{for \`Ind1'}".

With this PR, the output of LLVM's undname will be identical to
Microsoft's version. This changes `SpecialTableSymbolNode::TargetName`
to a node array which contains each key from the name. Unlike
namespaces, these keys are not in reverse order - they are in the same
order as in the mangled name.
---
 .../llvm/Demangle/MicrosoftDemangleNodes.h    |  2 +-
 llvm/lib/Demangle/MicrosoftDemangle.cpp       | 26 +++++++++++++++++--
 llvm/lib/Demangle/MicrosoftDemangleNodes.cpp  |  4 +--
 llvm/test/Demangle/ms-operators.test          | 15 +++++++++++
 4 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
index 155cfe8dd3a98..711aa70a4a8d3 100644
--- a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
+++ b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
@@ -708,7 +708,7 @@ struct DEMANGLE_ABI SpecialTableSymbolNode : public SymbolNode {
     return N->kind() == NodeKind::SpecialTableSymbol;
   }
 
-  QualifiedNameNode *TargetName = nullptr;
+  NodeArrayNode *TargetNames = nullptr;
   Qualifiers Quals = Qualifiers::Q_None;
 };
 
diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp
index b22928be3be50..250d382998982 100644
--- a/llvm/lib/Demangle/MicrosoftDemangle.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp
@@ -15,6 +15,8 @@
 
 #include "llvm/Demangle/MicrosoftDemangle.h"
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/Demangle/DemangleConfig.h"
 #include "llvm/Demangle/MicrosoftDemangleNodes.h"
@@ -277,6 +279,15 @@ demanglePointerCVQualifiers(std::string_view &MangledName) {
   DEMANGLE_UNREACHABLE;
 }
 
+static NodeArrayNode *smallVecToNodeArray(ArenaAllocator &Arena,
+                                          ArrayRef<Node *> Vec) {
+  NodeArrayNode *Arr = Arena.alloc<NodeArrayNode>();
+  Arr->Count = Vec.size();
+  Arr->Nodes = Arena.allocArray<Node *>(Vec.size());
+  std::memcpy(Arr->Nodes, Vec.data(), Vec.size() * sizeof(Node *));
+  return Arr;
+}
+
 std::string_view Demangler::copyString(std::string_view Borrowed) {
   char *Stable = Arena.allocUnalignedBuffer(Borrowed.size());
   // This is not a micro-optimization, it avoids UB, should Borrowed be an null
@@ -323,8 +334,19 @@ Demangler::demangleSpecialTableSymbolNode(std::string_view &MangledName,
   }
 
   std::tie(STSN->Quals, IsMember) = demangleQualifiers(MangledName);
-  if (!consumeFront(MangledName, '@'))
-    STSN->TargetName = demangleFullyQualifiedTypeName(MangledName);
+
+  SmallVector<Node *, 1> TargetNames;
+  while (!consumeFront(MangledName, '@')) {
+    QualifiedNameNode *QN = demangleFullyQualifiedTypeName(MangledName);
+    if (Error)
+      return nullptr;
+    assert(QN);
+    TargetNames.push_back(QN);
+  }
+
+  if (!TargetNames.empty())
+    STSN->TargetNames = smallVecToNodeArray(Arena, TargetNames);
+
   return STSN;
 }
 
diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
index 61e4961c714bc..17c6aab500049 100644
--- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
@@ -662,9 +662,9 @@ void VcallThunkIdentifierNode::output(OutputBuffer &OB,
 void SpecialTableSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const {
   outputQualifiers(OB, Quals, false, true);
   Name->output(OB, Flags);
-  if (TargetName) {
+  if (TargetNames) {
     OB << "{for `";
-    TargetName->output(OB, Flags);
+    TargetNames->output(OB, Flags, "'s `");
     OB << "'}";
   }
 }
diff --git a/llvm/test/Demangle/ms-operators.test b/llvm/test/Demangle/ms-operators.test
index b940488786631..cafa1ae3c0663 100644
--- a/llvm/test/Demangle/ms-operators.test
+++ b/llvm/test/Demangle/ms-operators.test
@@ -143,9 +143,24 @@
 ??_7A@B@@6BC@D@@@
 ; CHECK: const B::A::`vftable'{for `D::C'}
 
+??_7A@B@@6BC@D@@E@F@@@
+; CHECK: const B::A::`vftable'{for `D::C's `F::E'}
+
+??_7A@B@@6BC@D@@E@F@@G@H@@@
+; CHECK: const B::A::`vftable'{for `D::C's `F::E's `H::G'}
+
 ??_8Middle2@@7B@
 ; CHECK: const Middle2::`vbtable'
 
+??_7A@@6BB@@@
+; CHECK: const A::`vftable'{for `B'}
+
+??_7A@@6BB@@C@@@
+; CHECK: const A::`vftable'{for `B's `C'}
+
+??_7A@@6BB@@C@@D@@@
+; CHECK: const A::`vftable'{for `B's `C's `D'}
+
 ??_9Base@@$B7AA
 ; CHECK: [thunk]: __cdecl Base::`vcall'{8, {flat}}
 

From ff108f7486fafb13f330cec324f59a04442d01d4 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 5 Nov 2025 08:02:55 -0800
Subject: [PATCH 10/61] Fix failures introduced in #166032 (#166574)

---
 llvm/lib/CodeGen/AtomicExpandPass.cpp             | 2 +-
 llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 6412949948c07..0b55c03a46747 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1301,7 +1301,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop(
   // Atomic RMW expands to a Load-linked / Store-Conditional loop, because it is
   // hard to predict precise branch weigths we mark the branch as "unknown"
   // (50/50) to prevent misleading optimizations.
-  setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, *F, DEBUG_TYPE);
+  setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE);
 
   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
   return Loaded;
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 7930b38174e49..0f3e66476f055 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -388,8 +388,7 @@ static void buildPartialInvariantUnswitchConditionalBranch(
       IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
                        Direction ? &NormalSucc : &UnswitchedSucc, ProfData);
   if (!ProfData)
-    setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(),
-                                                DEBUG_TYPE);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
 }
 
 /// Rewrite the PHI nodes in an unswitched loop exit basic block.
@@ -3203,8 +3202,7 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
   auto *InvariantBr =
       Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
   // We don't know anything about the relation between the limits.
-  setExplicitlyUnknownBranchWeightsIfProfiled(
-      *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
+  setExplicitlyUnknownBranchWeightsIfProfiled(*InvariantBr, DEBUG_TYPE);
 
   Builder.SetInsertPoint(CheckBlock);
   Builder.CreateCondBr(

From a796d1836930bc721686a728852b820a2ea4de92 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 5 Nov 2025 16:03:31 +0000
Subject: [PATCH 11/61] [gn build] Port 370058777be2

---
 llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
index 1a890f6733597..a234d2be67f66 100644
--- a/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
@@ -12,7 +12,6 @@ static_library("BinaryFormat") {
     "ELF.cpp",
     "MachO.cpp",
     "Magic.cpp",
-    "Minidump.cpp",
     "MsgPackDocument.cpp",
     "MsgPackDocumentYAML.cpp",
     "MsgPackReader.cpp",

From ef6947b098e8086ac0d72086b1c63cb8d82ba797 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 5 Nov 2025 16:03:32 +0000
Subject: [PATCH 12/61] [gn build] Port 3ebed51e997b

---
 llvm/utils/gn/secondary/clang/unittests/CodeGen/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/CodeGen/BUILD.gn
index 065fc6cdd74a3..bd8d9610c2a4a 100644
--- a/llvm/utils/gn/secondary/clang/unittests/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/CodeGen/BUILD.gn
@@ -17,6 +17,7 @@ unittest("ClangCodeGenTests") {
     "BufferSourceTest.cpp",
     "CheckTargetFeaturesTest.cpp",
     "CodeGenExternalTest.cpp",
+    "DemangleTrapReasonInDebugInfo.cpp",
     "TBAAMetadataTest.cpp",
   ]
 }

From 9bb67f88ed287ebf386ed7579d05abd5d214e52a Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 5 Nov 2025 16:03:33 +0000
Subject: [PATCH 13/61] [gn build] Port 51d0f6d6e172

---
 .../gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn  | 1 +
 .../gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn      | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index 2f84999621e1b..3c3fdf7e16885 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -82,6 +82,7 @@ static_library("bugprone") {
     "SmartPtrArrayMismatchCheck.cpp",
     "SpuriouslyWakeUpFunctionsCheck.cpp",
     "StandaloneEmptyCheck.cpp",
+    "StdNamespaceModificationCheck.cpp",
     "StringConstructorCheck.cpp",
     "StringIntegerAssignmentCheck.cpp",
     "StringLiteralWithEmbeddedNulCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
index ec642b6afad66..1eae289143b5b 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
@@ -16,7 +16,6 @@ static_library("cert") {
   ]
   sources = [
     "CERTTidyModule.cpp",
-    "DontModifyStdNamespaceCheck.cpp",
     "FloatLoopCounter.cpp",
     "LimitedRandomnessCheck.cpp",
     "MutatingCopyCheck.cpp",

From 3bf0ce15f80ddffe0fff1a52500c79b9bcd011e4 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 5 Nov 2025 16:03:34 +0000
Subject: [PATCH 14/61] [gn build] Port 718a3b268fcf

---
 llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn
index 393309ee39bfe..a261f2866be47 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn
@@ -12,7 +12,7 @@ static_library("Passes") {
     "//llvm/utils/gn/build/libs/pthread",
   ]
   sources = [
-    "ADRRelaxationPass.cpp",
+    "AArch64RelaxationPass.cpp",
     "Aligner.cpp",
     "AllocCombiner.cpp",
     "AsmDump.cpp",

From ce5dac67ffddfa9b9f141b35a4bf05bdee970676 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 5 Nov 2025 16:03:35 +0000
Subject: [PATCH 15/61] [gn build] Port dd14eb8242d7

---
 llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn
index a1f5b475e2096..ad72c0069237d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn
@@ -151,6 +151,7 @@ static_library("LLVMRISCVCodeGen") {
     "RISCVMoveMerger.cpp",
     "RISCVOptWInstrs.cpp",
     "RISCVPostRAExpandPseudoInsts.cpp",
+    "RISCVPromoteConstant.cpp",
     "RISCVPushPopOptimizer.cpp",
     "RISCVRedundantCopyElimination.cpp",
     "RISCVRegisterInfo.cpp",

From bb367c14aea99ab744406f3e3a700186fd3f1ad6 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 5 Nov 2025 08:23:22 -0800
Subject: [PATCH 16/61] [polly] Fix unused variable warnings

This patch fixes:

  polly/lib/Transform/ScheduleOptimizer.cpp:935:17: error: unused
  variable 'File' [-Werror,-Wunused-variable]

  polly/lib/Transform/ScheduleOptimizer.cpp:936:9: error: unused
  variable 'Line' [-Werror,-Wunused-variable]

  polly/lib/Transform/ScheduleOptimizer.cpp:937:17: error: unused
  variable 'Msg' [-Werror,-Wunused-variable]
---
 polly/lib/Transform/ScheduleOptimizer.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index cb08397c201f2..f01d3decd9a1c 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -932,13 +932,14 @@ static void runIslScheduleOptimizer(
     POLLY_DEBUG(dbgs() << "Schedule optimizer calculation exceeds ISL quota\n");
     return;
   } else if (isl_ctx_last_error(Ctx) != isl_error_none) {
-    const char *File = isl_ctx_last_error_file(Ctx);
-    int Line = isl_ctx_last_error_line(Ctx);
-    const char *Msg = isl_ctx_last_error_msg(Ctx);
-    POLLY_DEBUG(
-        dbgs()
-        << "ISL reported an error during the computation of a new schedule at "
-        << File << ":" << Line << ": " << Msg);
+    POLLY_DEBUG({
+      const char *File = isl_ctx_last_error_file(Ctx);
+      int Line = isl_ctx_last_error_line(Ctx);
+      const char *Msg = isl_ctx_last_error_msg(Ctx);
+      dbgs() << "ISL reported an error during the computation of a new "
+                "schedule at "
+             << File << ":" << Line << ": " << Msg;
+    });
     isl_ctx_reset_error(Ctx);
     return;
   } else if (Schedule.is_null()) {

From 0b5a00aab73967026292d0c1c84b87fabdcc2648 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Wed, 5 Nov 2025 08:40:55 -0800
Subject: [PATCH 17/61] [CI][NFC] Reformat Python Files in .ci directory

Reviewers:

Pull Request: https://github.com/llvm/llvm-project/pull/166587
---
 .ci/generate_test_report_github.py | 1 +
 .ci/generate_test_report_lib.py    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.ci/generate_test_report_github.py b/.ci/generate_test_report_github.py
index 6785e82f3440b..08387de817467 100644
--- a/.ci/generate_test_report_github.py
+++ b/.ci/generate_test_report_github.py
@@ -8,6 +8,7 @@
 
 import generate_test_report_lib
 
+
 def compute_platform_title() -> str:
     logo = ":window:" if platform.system() == "Windows" else ":penguin:"
     # On Linux the machine value is x86_64 on Windows it is AMD64.
diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 7820fbda803d7..0c025c561f6f7 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -100,6 +100,7 @@ def _format_ninja_failures(ninja_failures: list[tuple[str, str]]) -> list[str]:
         )
     return output
 
+
 def get_failures(junit_objects) -> dict[str, list[tuple[str, str]]]:
     failures = {}
     for results in junit_objects:

From 99334f74ae1cd57634c87975e32c797bed3865ce Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 5 Nov 2025 08:41:12 -0800
Subject: [PATCH 18/61] [ADT] Add static_assert to llvm::to_address for
 function types (#166505)

This patch aligns llvm::to_address with C++20 std::to_address by
adding a static_assert to prevent instantiation with function types.
The C++20 standard says that std::to_address is ill-formed on a
function type.
---
 llvm/include/llvm/ADT/STLForwardCompat.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h
index ad94cdede9288..b975a403cd042 100644
--- a/llvm/include/llvm/ADT/STLForwardCompat.h
+++ b/llvm/include/llvm/ADT/STLForwardCompat.h
@@ -142,7 +142,10 @@ struct identity // NOLINT(readability-identifier-naming)
 /// The std::pointer_traits<>::to_address(p) variations of these overloads has
 /// not been implemented.
 template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); }
-template <class T> constexpr T *to_address(T *P) { return P; }
+template <class T> constexpr T *to_address(T *P) {
+  static_assert(!std::is_function_v<T>);
+  return P;
+}
 
 //===----------------------------------------------------------------------===//
 //     Features from C++23

From ab02808c66b0e14c35356c378399ca04a9bc7271 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 5 Nov 2025 08:41:24 -0800
Subject: [PATCH 19/61] [Support] Simplify minIntN and isUIntN (NFC) (#166506)

---
 llvm/include/llvm/Support/MathExtras.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index 9bbb8a2a30541..0a253efc2abcb 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -225,7 +225,7 @@ inline constexpr int64_t minIntN(int64_t N) {
 
   if (N == 0)
     return 0;
-  return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
+  return UINT64_MAX << (N - 1);
 }
 
 /// Gets the maximum value for a N-bit signed integer.
@@ -241,7 +241,7 @@ inline constexpr int64_t maxIntN(int64_t N) {
 
 /// Checks if an unsigned integer fits into the given (dynamic) bit width.
 inline constexpr bool isUIntN(unsigned N, uint64_t x) {
-  return N >= 64 || x <= maxUIntN(N);
+  return N >= 64 || (x >> N) == 0;
 }
 
 /// Checks if an signed integer fits into the given (dynamic) bit width.

From 0b29c3c1a1bd3d57961dbba7a39b8dae29b4ecf2 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 5 Nov 2025 08:41:32 -0800
Subject: [PATCH 20/61] [Hexagon] Remove redundant declarations (NFC) (#166507)

These two functions are decalred in Hexagon.h.

Identified with readability-redundant-declaration.
---
 llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
index 479ac90b7d526..f29a739cb5c07 100644
--- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
@@ -104,13 +104,6 @@ const std::map<unsigned short, unsigned short> QFPInstMap{
     {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}};
 } // namespace
 
-namespace llvm {
-
-FunctionPass *createHexagonQFPOptimizer();
-void initializeHexagonQFPOptimizerPass(PassRegistry &);
-
-} // namespace llvm
-
 namespace {
 
 struct HexagonQFPOptimizer : public MachineFunctionPass {

From aea75d059f24b4f191d7601f68d61b28f78c4d4d Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 5 Nov 2025 08:41:39 -0800
Subject: [PATCH 21/61] [ObjectYAML] Remove redundant declarations (NFC)
 (#166508)

In C++17, static constexpr members are implicitly inline, so they no
longer require an out-of-line definition.

Identified with readability-redundant-declaration.
---
 llvm/lib/ObjectYAML/ELFYAML.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index e5e5fc20728e8..29f291614ffc6 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -37,8 +37,6 @@ unsigned Object::getMachine() const {
     return *Header.Machine;
   return llvm::ELF::EM_NONE;
 }
-
-constexpr StringRef SectionHeaderTable::TypeStr;
 } // namespace ELFYAML
 
 namespace yaml {

From d7c1df38b99bd3ab01b50cfe5fb6b0b2e5044091 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 5 Nov 2025 08:41:47 -0800
Subject: [PATCH 22/61] [llvm] Proofread GoldPlugin.rst (#166509)

---
 llvm/docs/GoldPlugin.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/docs/GoldPlugin.rst b/llvm/docs/GoldPlugin.rst
index 07d2fc203eba5..606f9e0820e60 100644
--- a/llvm/docs/GoldPlugin.rst
+++ b/llvm/docs/GoldPlugin.rst
@@ -83,7 +83,7 @@ which is why you otherwise need gold to be the installed system linker in
 your path.
 
 ``ar`` and ``nm`` also accept the ``-plugin`` option and it's possible to
-to install ``LLVMgold.so`` to ``/usr/lib/bfd-plugins`` for a seamless setup.
+install ``LLVMgold.so`` to ``/usr/lib/bfd-plugins`` for a seamless setup.
 If you built your own gold, be sure to install the ``ar`` and ``nm-new`` you
 built to ``/usr/bin``.
 
@@ -143,7 +143,7 @@ Quickstart for using LTO with autotooled projects
 =================================================
 
 Once your system ``ld``, ``ar``, and ``nm`` all support LLVM bitcode,
-everything is in place for an easy to use LTO build of autotooled projects:
+everything is in place for an easy-to-use LTO build of autotooled projects:
 
 * Follow the instructions :ref:`on how to build LLVMgold.so
   <lto-how-to-build>`.

From 6fec104b45734034f3772747e5adb47b1f7ee658 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?=
 <jmartinezcaamao@gmail.com>
Date: Wed, 5 Nov 2025 17:42:44 +0100
Subject: [PATCH 23/61] [AMDGPU] Enable typechecks for
 __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16 (#166547)

We didn't remove the `t` for this builtin in the past due to not being
sure if we should use `float16/half`.

This patch doesn't fix the _Float16/half question, I'll address that in
a separate patch later (after discussing the options on our weekly
meeting). At the moment we maintain the `h` for this builtin (which is
likely not what we want for HIP).
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def                | 2 +-
 .../test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip  | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 36cb527a9c806..2b6fcb1fd479b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -180,7 +180,7 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
 BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32, "iiQbiiIi", "")
 
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32, "ffQbiiIi", "", "atomic-fadd-rtn-insts")
-TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2hV2hQbiiIi", "t", "atomic-buffer-global-pk-add-f16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2hV2hQbiiIi", "", "atomic-buffer-global-pk-add-f16-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
diff --git a/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip b/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
index 8ee64d486f4f4..fea86162c801d 100644
--- a/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
+++ b/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
@@ -14,5 +14,9 @@ __device__ void test_raw_ptr_atomics(__amdgpu_buffer_rsrc_t rsrc, int i32, float
 __device__ void test_raw_ptr_atomics_err(__amdgpu_buffer_rsrc_t rsrc, int i32, float f32, float16x2_t v2f16, int offset, int soffset) {
   i32 = __builtin_amdgcn_raw_ptr_buffer_atomic_add_i32(i32, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
   f32 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32(f32, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
-  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0, 4);
+  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
+}
+
+__device__ void test_raw_ptr_atomics_f16_retty(__amdgpu_buffer_rsrc_t rsrc, int i32, float f32, float16x2_t v2f16, int offset, int soffset) {
+  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0);
 }

From 3154a841be807943fc83604ab8b2d9ecf300ac21 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Wed, 5 Nov 2025 16:44:58 +0000
Subject: [PATCH 24/61] [AMDGPU] Autogenerate R600 packetizer checks (#166570)

---
 llvm/test/CodeGen/AMDGPU/packetizer.ll | 52 ++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll
index aab035f811434..b9bf13886d366 100644
--- a/llvm/test/CodeGen/AMDGPU/packetizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll
@@ -1,13 +1,49 @@
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
-; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s
-
-; CHECK: {{^}}test:
-; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X
-; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y
-; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
-; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -check-prefix=R600
+; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s -check-prefix=CM
 
 define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
+; R600-LABEL: test:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     ADD_INT T0.Y, KC0[3].X, 1,
+; R600-NEXT:     ADD_INT T0.Z, KC0[3].Y, 1,
+; R600-NEXT:     ADD_INT T0.W, KC0[2].Z, 1,
+; R600-NEXT:     ADD_INT * T1.W, KC0[2].W, 1,
+; R600-NEXT:     BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z,
+; R600-NEXT:     BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z,
+; R600-NEXT:     BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z,
+; R600-NEXT:     BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z,
+; R600-NEXT:     OR_INT T0.W, PV.W, PV.Z,
+; R600-NEXT:     OR_INT * T1.W, PV.Y, PV.X,
+; R600-NEXT:     OR_INT T0.X, PS, PV.W,
+; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: test:
+; CM:       ; %bb.0: ; %entry
+; CM-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT:    CF_END
+; CM-NEXT:    PAD
+; CM-NEXT:    ALU clause starting at 4:
+; CM-NEXT:     ADD_INT T0.X, KC0[3].X, 1,
+; CM-NEXT:     ADD_INT T0.Y, KC0[3].Y, 1,
+; CM-NEXT:     ADD_INT T0.Z, KC0[2].Z, 1,
+; CM-NEXT:     ADD_INT * T0.W, KC0[2].W, 1,
+; CM-NEXT:     BIT_ALIGN_INT T1.X, PV.W, PV.W, KC0[3].Z,
+; CM-NEXT:     BIT_ALIGN_INT T1.Y, PV.Z, PV.Z, KC0[3].Z,
+; CM-NEXT:     BIT_ALIGN_INT T0.Z, PV.Y, PV.Y, KC0[3].Z,
+; CM-NEXT:     BIT_ALIGN_INT * T0.W, PV.X, PV.X, KC0[3].Z,
+; CM-NEXT:     OR_INT T0.Z, PV.W, PV.Z,
+; CM-NEXT:     OR_INT * T0.W, PV.Y, PV.X,
+; CM-NEXT:     OR_INT * T0.X, PV.W, PV.Z,
+; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %shl = sub i32 32, %e
   %x = add i32 %x_arg, 1

From d4e3a2327da11e07961117b3b443de24a8d80095 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 5 Nov 2025 18:45:33 +0200
Subject: [PATCH 25/61] [clang] [doc] Document that the ms_abi attribute works
 on aarch64 too (#166373)

Since 022e782e75766e9dd98b9e18572129cd313f3ab5 (2017) this attribute has
an effect on both aarch64 and x86_64; update the docs to reflect this.
---
 clang/include/clang/Basic/AttrDocs.td | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 2fdd041c1b46e..1be9a96aa44de 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3450,9 +3450,9 @@ Mac, and BSD. This attribute has no effect on other targets.
 def MSABIDocs : Documentation {
   let Category = DocCatCallingConvs;
   let Content = [{
-On non-Windows x86_64 targets, this attribute changes the calling convention of
-a function to match the default convention used on Windows x86_64. This
-attribute has no effect on Windows targets or non-x86_64 targets.
+On non-Windows x86_64 and aarch64 targets, this attribute changes the calling convention of
+a function to match the default convention used on Windows. This
+attribute has no effect on Windows targets or non-x86_64, non-aarch64 targets.
   }];
 }
 

From 95c87505255032c1cfcd4091e1e114865f62be9a Mon Sep 17 00:00:00 2001
From: Joshua Rodriguez <josh.rodriguez@arm.com>
Date: Wed, 5 Nov 2025 16:53:45 +0000
Subject: [PATCH 26/61] [AArch64][GlobalISel] Added pmull/pmull64 intrinsic
 support (#165740)

GISel no longer falls back onto SDAG when attempting to lower the pmull
and pmull64 intrinsics.
---
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |   7 +
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   3 +
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |   1 +
 llvm/test/CodeGen/AArch64/aarch64-smull.ll    |  84 ++++++---
 .../test/CodeGen/AArch64/arm64-neon-3vdiff.ll |  55 +++---
 llvm/test/CodeGen/AArch64/arm64-vmul.ll       | 161 +++++++++++-------
 .../CodeGen/AArch64/highextractbitcast.ll     |  26 +--
 7 files changed, 219 insertions(+), 118 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 30b7b03f7a69a..52b216c7fe0f0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -197,6 +197,12 @@ def G_SMULL : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_PMULL : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src1, type1:$src2);
+  let hasSideEffects = 0;
+}
+
 def G_UADDLP : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1);
@@ -273,6 +279,7 @@ def : GINodeEquiv<G_FCMGT, AArch64fcmgt>;
 
 def : GINodeEquiv<G_BSP, AArch64bsp>;
 
+def : GINodeEquiv<G_PMULL, AArch64pmull>;
 def : GINodeEquiv<G_UMULL, AArch64umull>;
 def : GINodeEquiv<G_SMULL, AArch64smull>;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 5f93847bc680e..038ad77ae69b2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1809,6 +1809,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return LowerBinOp(TargetOpcode::G_FMAXNUM);
   case Intrinsic::aarch64_neon_fminnm:
     return LowerBinOp(TargetOpcode::G_FMINNUM);
+  case Intrinsic::aarch64_neon_pmull:
+  case Intrinsic::aarch64_neon_pmull64:
+    return LowerBinOp(AArch64::G_PMULL);
   case Intrinsic::aarch64_neon_smull:
     return LowerBinOp(AArch64::G_SMULL);
   case Intrinsic::aarch64_neon_umull:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 6d2d70511e894..6b920f05227ad 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -560,6 +560,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
   case TargetOpcode::G_FCMP:
   case TargetOpcode::G_LROUND:
   case TargetOpcode::G_LLROUND:
+  case AArch64::G_PMULL:
     return true;
   case TargetOpcode::G_INTRINSIC:
     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 0cd885e599817..e85e808921c87 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1,10 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:       warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: smull_v8i8_v8i16:
@@ -1832,14 +1829,33 @@ entry:
 }
 
 define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
-; CHECK-LABEL: pmlsl2_v8i16_uzp1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q2, [x1, #16]
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
-; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEON-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-NEON-NEXT:    str q0, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
+; CHECK-SVE-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-SVE-NEXT:    str q0, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q2, [x1, #16]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
+; CHECK-GI-NEXT:    pmull v0.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
   %5 = getelementptr inbounds i32, ptr %3, i64 4
   %6 = load <8 x i16>, ptr %5, align 4
   %7 = trunc <8 x i16> %6 to <8 x i8>
@@ -1991,16 +2007,40 @@ define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
 }
 
 define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
-; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    pmull v3.8h, v0.8b, v2.8b
-; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
-; CHECK-NEXT:    add v0.8h, v3.8h, v0.8h
-; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEON-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    pmull v3.8h, v0.8b, v2.8b
+; CHECK-NEON-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    add v0.8h, v3.8h, v0.8h
+; CHECK-NEON-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-NEON-NEXT:    str q0, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
+; CHECK-SVE-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT:    pmull v3.8h, v0.8b, v2.8b
+; CHECK-SVE-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    add v0.8h, v3.8h, v0.8h
+; CHECK-SVE-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-SVE-NEXT:    str q0, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldp q2, q3, [x1]
+; CHECK-GI-NEXT:    mov d4, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
+; CHECK-GI-NEXT:    xtn v3.8b, v3.8h
+; CHECK-GI-NEXT:    pmull v0.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    pmull v2.8h, v4.8b, v3.8b
+; CHECK-GI-NEXT:    add v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %5 = load <8 x i16>, ptr %3, align 4
   %6 = trunc <8 x i16> %5 to <8 x i8>
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index 2a8b3ce2ae10b..8cb319b2c3368 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -1,11 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:       warning: Instruction selection used fallback path for test_vmull_p8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_vmull_high_p8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_vmull_p64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_vmull_high_p64
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5
@@ -2721,14 +2716,24 @@ entry:
 }
 
 define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
-; CHECK-LABEL: test_vmull_p64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT:    mov x1, v0.d[1]
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vmull_p64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov d0, x1
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-SD-NEXT:    mov x1, v0.d[1]
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vmull_p64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov d0, x0
+; CHECK-GI-NEXT:    fmov d1, x1
+; CHECK-GI-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x1, d1
+; CHECK-GI-NEXT:    ret
 entry:
   %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
   %vmull3.i = bitcast <16 x i8> %vmull2.i to i128
@@ -2736,12 +2741,22 @@ entry:
 }
 
 define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
-; CHECK-LABEL: test_vmull_high_p64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
-; CHECK-NEXT:    mov x1, v0.d[1]
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vmull_high_p64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
+; CHECK-SD-NEXT:    mov x1, v0.d[1]
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vmull_high_p64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    mov d1, v1.d[1]
+; CHECK-GI-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x1, d1
+; CHECK-GI-NEXT:    ret
 entry:
   %0 = extractelement <2 x i64> %a, i32 1
   %1 = extractelement <2 x i64> %b, i32 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index e6df9f2fb2c56..90abc7d389c13 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -2,44 +2,35 @@
 ; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for pmull8h
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for commutable_pmull8h
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmulh_1s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_2s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_4s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_2d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_commuted_neg_2s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_commuted_neg_4s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_commuted_neg_2d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_2s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_4s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_2d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_2s_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_4s_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_2d_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmulh_lane_1s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmlal_lane_1d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmlsl_lane_1d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmull_from_extract_dup_low
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmull_from_extract_dup_high
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmull_from_extract_duplane_low
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmull_from_extract_duplane_high
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmlal_d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmlsl_d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_pmull_64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_pmull_high_64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_commutable_pmull_64
+; CHECK-GI:	 warning: Instruction selection used fallback path for sqdmulh_1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d
 
 define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: smull8h:
@@ -2895,11 +2886,18 @@ define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
 }
 
 define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
-; CHECK-LABEL: pmull_from_extract_dup_high:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.16b, w0
-; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: pmull_from_extract_dup_high:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.16b, w0
+; CHECK-SD-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: pmull_from_extract_dup_high:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    dup v1.8b, w0
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    pmull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
   %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 
@@ -2924,12 +2922,20 @@ define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs)
 }
 
 define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK-LABEL: pmull_from_extract_duplane_high:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    dup v1.16b, v1.b[0]
-; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: pmull_from_extract_duplane_high:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    dup v1.16b, v1.b[0]
+; CHECK-SD-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: pmull_from_extract_duplane_high:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    dup v1.8b, v1.b[0]
+; CHECK-GI-NEXT:    pmull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 
@@ -3245,21 +3251,35 @@ define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
 }
 
 define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
-; CHECK-LABEL: test_pmull_64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_pmull_64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d0, x1
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_pmull_64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov d0, x0
+; CHECK-GI-NEXT:    fmov d1, x1
+; CHECK-GI-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    ret
   %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
   ret <16 x i8> %val
 }
 
 define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
-; CHECK-LABEL: test_pmull_high_64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_pmull_high_64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_pmull_high_64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    mov d1, v1.d[1]
+; CHECK-GI-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    ret
   %l_hi = extractelement <2 x i64> %l, i32 1
   %r_hi = extractelement <2 x i64> %r, i32 1
   %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
@@ -3267,13 +3287,22 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
 }
 
 define <16 x i8> @test_commutable_pmull_64(i64 %l, i64 %r) nounwind {
-; CHECK-LABEL: test_commutable_pmull_64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT:    add v0.16b, v0.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_commutable_pmull_64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d0, x1
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-SD-NEXT:    add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_commutable_pmull_64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov d0, x0
+; CHECK-GI-NEXT:    fmov d1, x1
+; CHECK-GI-NEXT:    pmull v2.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-GI-NEXT:    add v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %1 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
   %2 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %r, i64 %l)
   %3 = add <16 x i8> %1, %2
diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
index df4889b6f09de..bd6c168ce8776 100644
--- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll
+++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
@@ -1,10 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes CHECK,CHECK-LE
 ; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI
-
-; CHECK-GI:       warning: Instruction selection used fallback path for test_pmull_high_p8_128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_pmull_high_p8_64
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI
 
 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
@@ -521,12 +518,12 @@ entry:
 }
 
 define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
-; CHECK-LABEL: test_pmull_high_p8_128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x3
-; CHECK-NEXT:    fmov d1, x1
-; CHECK-NEXT:    pmull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-LE-LABEL: test_pmull_high_p8_128:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    fmov d0, x3
+; CHECK-LE-NEXT:    fmov d1, x1
+; CHECK-LE-NEXT:    pmull v0.8h, v1.8b, v0.8b
+; CHECK-LE-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_pmull_high_p8_128:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -538,6 +535,15 @@ define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
 ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_pmull_high_p8_128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    mov v1.d[0], x2
+; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v1.d[1], x3
+; CHECK-GI-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %a = bitcast i128 %aa to <16 x i8>
   %b = bitcast i128 %bb to <16 x i8>

From 0b72899f6db93dab140415e800130c7c82c255b1 Mon Sep 17 00:00:00 2001
From: Victor Campos <victor.campos@arm.com>
Date: Wed, 5 Nov 2025 16:59:55 +0000
Subject: [PATCH 27/61] [libc][math] Refactor the `math_errhandling` macro
 definition (#166350)

This patch refactors the logic to define each component of the
`math_errhandling` macro.

It assumes that math error handling is supported by the target and the C
library unless otherwise disabled in the preprocessor logic.

In addition to the refactoring, the support for error handling via
exceptions is explicitly disabled for Arm targets with no FPU, that is,
where `__ARM_FP` is not defined. This is because LLVM libc does not
provide a floating-point environment for Arm no-FP configurations (or at
least one with support for FP exceptions).
---
 libc/include/llvm-libc-macros/math-macros.h | 31 ++++++++++++++++++---
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h
index 6697ce5b03851..e1b12e3010fe9 100644
--- a/libc/include/llvm-libc-macros/math-macros.h
+++ b/libc/include/llvm-libc-macros/math-macros.h
@@ -42,14 +42,37 @@
 #define FP_LLOGBNAN LONG_MAX
 #endif
 
-#if defined(__NVPTX__) || defined(__AMDGPU__) || defined(__FAST_MATH__)
-#define math_errhandling 0
-#elif defined(__NO_MATH_ERRNO__)
-#define math_errhandling (MATH_ERREXCEPT)
+// Math error handling. Target support is assumed to be existent unless
+// explicitly disabled.
+#if defined(__NVPTX__) || defined(__AMDGPU__) || defined(__FAST_MATH__) ||     \
+    defined(__NO_MATH_ERRNO__)
+#define __LIBC_SUPPORTS_MATH_ERRNO 0
+#else
+#define __LIBC_SUPPORTS_MATH_ERRNO 1
+#endif
+
+#if defined(__FAST_MATH__) ||                                                  \
+    ((defined(__arm__) || defined(_M_ARM) || defined(__thumb__) ||             \
+      defined(__aarch64__) || defined(_M_ARM64)) &&                            \
+     !defined(__ARM_FP))
+#define __LIBC_SUPPORTS_MATH_ERREXCEPT 0
 #else
+#define __LIBC_SUPPORTS_MATH_ERREXCEPT 1
+#endif
+
+#if __LIBC_SUPPORTS_MATH_ERRNO && __LIBC_SUPPORTS_MATH_ERREXCEPT
 #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT)
+#elif __LIBC_SUPPORTS_MATH_ERRNO
+#define math_errhandling (MATH_ERRNO)
+#elif __LIBC_SUPPORTS_MATH_ERREXCEPT
+#define math_errhandling (MATH_ERREXCEPT)
+#else
+#define math_errhandling 0
 #endif
 
+#undef __LIBC_SUPPORTS_MATH_ERRNO
+#undef __LIBC_SUPPORTS_MATH_ERREXCEPT
+
 // POSIX math constants
 // https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/math.h.html
 #define M_E (__extension__ 0x1.5bf0a8b145769p1)

From 9b1719efa063b78a996d837b8b4bcb11ddcffcf8 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Wed, 5 Nov 2025 17:04:57 +0000
Subject: [PATCH 28/61] [lldb] Mark single-argument SourceLanguage constructors
 explicit (#166527)

This avoids unintentional comparisons between `SourceLanguage` and
`LanguageType`.

Also marks `operator bool` explicit so we don't implicitly convert to
bool.
---
 lldb/include/lldb/lldb-private-types.h              | 11 ++++++++---
 lldb/source/Breakpoint/BreakpointLocation.cpp       |  2 +-
 lldb/source/Commands/CommandObjectDWIMPrint.cpp     | 13 +++++++------
 lldb/source/Expression/UserExpression.cpp           |  2 +-
 .../Clang/ClangExpressionParser.cpp                 |  2 +-
 lldb/source/Target/StackFrame.cpp                   |  6 +++---
 lldb/source/Target/Target.cpp                       |  2 +-
 7 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/lldb/include/lldb/lldb-private-types.h b/lldb/include/lldb/lldb-private-types.h
index b82a2b8aa0574..185467e91bf62 100644
--- a/lldb/include/lldb/lldb-private-types.h
+++ b/lldb/include/lldb/lldb-private-types.h
@@ -102,13 +102,18 @@ struct RegisterSet {
 /// A type-erased pair of llvm::dwarf::SourceLanguageName and version.
 struct SourceLanguage {
   SourceLanguage() = default;
-  SourceLanguage(lldb::LanguageType language_type);
+  explicit SourceLanguage(lldb::LanguageType language_type);
+
   SourceLanguage(uint16_t name, uint32_t version)
       : name(name), version(version) {}
-  SourceLanguage(std::optional<std::pair<uint16_t, uint32_t>> name_vers)
+
+  explicit SourceLanguage(
+      std::optional<std::pair<uint16_t, uint32_t>> name_vers)
       : name(name_vers ? name_vers->first : 0),
         version(name_vers ? name_vers->second : 0) {}
-  operator bool() const { return name > 0; }
+
+  explicit operator bool() const { return name > 0; }
+
   lldb::LanguageType AsLanguageType() const;
   llvm::StringRef GetDescription() const;
   bool IsC() const;
diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp
index f25209c15e007..25285beb7ffd5 100644
--- a/lldb/source/Breakpoint/BreakpointLocation.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocation.cpp
@@ -251,7 +251,7 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx,
     }
 
     m_user_expression_sp.reset(GetTarget().GetUserExpressionForLanguage(
-        condition.GetText(), llvm::StringRef(), language,
+        condition.GetText(), llvm::StringRef(), SourceLanguage{language},
         Expression::eResultTypeAny, EvaluateExpressionOptions(), nullptr,
         error));
     if (error.Fail()) {
diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
index 0d9eb45732161..40f00c90bbbfb 100644
--- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp
+++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
@@ -95,9 +95,9 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
   StackFrame *frame = m_exe_ctx.GetFramePtr();
 
   // Either the language was explicitly specified, or we check the frame.
-  lldb::LanguageType language = m_expr_options.language;
-  if (language == lldb::eLanguageTypeUnknown && frame)
-    language = frame->GuessLanguage().AsLanguageType();
+  SourceLanguage language{m_expr_options.language};
+  if (!language && frame)
+    language = frame->GuessLanguage();
 
   // Add a hint if object description was requested, but no description
   // function was implemented.
@@ -119,8 +119,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
         "^<\\S+: 0x[[:xdigit:]]{5,}>\\s*$");
 
     if (GetDebugger().GetShowDontUsePoHint() && target_ptr &&
-        (language == lldb::eLanguageTypeSwift ||
-         language == lldb::eLanguageTypeObjC) &&
+        (language.AsLanguageType() == lldb::eLanguageTypeSwift ||
+         language.IsObjC()) &&
         std::regex_match(output.data(), swift_class_regex)) {
 
       result.AppendNote(
@@ -193,7 +193,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
 
   // Second, try `expr` as a persistent variable.
   if (expr.starts_with("$"))
-    if (auto *state = target.GetPersistentExpressionStateForLanguage(language))
+    if (auto *state = target.GetPersistentExpressionStateForLanguage(
+            language.AsLanguageType()))
       if (auto var_sp = state->GetVariable(expr))
         if (auto valobj_sp = var_sp->GetValueObject()) {
           dump_val_object(*valobj_sp);
diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp
index af4b477660eeb..5563eba21777e 100644
--- a/lldb/source/Expression/UserExpression.cpp
+++ b/lldb/source/Expression/UserExpression.cpp
@@ -246,7 +246,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
   // language in the target's properties if specified, else default to the
   // langage for the frame.
   if (!language) {
-    if (target->GetLanguage() != lldb::eLanguageTypeUnknown)
+    if (target->GetLanguage())
       language = target->GetLanguage();
     else if (StackFrame *frame = exe_ctx.GetFramePtr())
       language = frame->GetLanguage();
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
index 990074566be7e..6bab880b4d521 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
@@ -1502,7 +1502,7 @@ lldb_private::Status ClangExpressionParser::DoPrepareForExecution(
     LLDB_LOGF(log, "%s - Current expression language is %s\n", __FUNCTION__,
               lang.GetDescription().data());
     lldb::ProcessSP process_sp = exe_ctx.GetProcessSP();
-    if (process_sp && lang != lldb::eLanguageTypeUnknown) {
+    if (process_sp && lang) {
       auto runtime = process_sp->GetLanguageRuntime(lang.AsLanguageType());
       if (runtime)
         runtime->GetIRPasses(custom_passes);
diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index 2ed58c5331df4..95b515412d693 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -1344,18 +1344,18 @@ const char *StackFrame::GetDisplayFunctionName() {
 SourceLanguage StackFrame::GetLanguage() {
   CompileUnit *cu = GetSymbolContext(eSymbolContextCompUnit).comp_unit;
   if (cu)
-    return cu->GetLanguage();
+    return SourceLanguage{cu->GetLanguage()};
   return {};
 }
 
 SourceLanguage StackFrame::GuessLanguage() {
   SourceLanguage lang_type = GetLanguage();
 
-  if (lang_type == eLanguageTypeUnknown) {
+  if (!lang_type) {
     SymbolContext sc =
         GetSymbolContext(eSymbolContextFunction | eSymbolContextSymbol);
     if (sc.function)
-      lang_type = LanguageType(sc.function->GetMangled().GuessLanguage());
+      lang_type = SourceLanguage(sc.function->GetMangled().GuessLanguage());
     else if (sc.symbol)
       lang_type = SourceLanguage(sc.symbol->GetMangled().GuessLanguage());
   }
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index a23091ad09c6d..e53fc7a1e1bda 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -4945,7 +4945,7 @@ void TargetProperties::SetStandardErrorPath(llvm::StringRef path) {
 
 SourceLanguage TargetProperties::GetLanguage() const {
   const uint32_t idx = ePropertyLanguage;
-  return {GetPropertyAtIndexAs<LanguageType>(idx, {})};
+  return SourceLanguage{GetPropertyAtIndexAs<LanguageType>(idx, {})};
 }
 
 llvm::StringRef TargetProperties::GetExpressionPrefixContents() {

From 056d2c12f75654b4b78c938a5243fa57efbd1547 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 5 Nov 2025 09:10:36 -0800
Subject: [PATCH 29/61] RuntimeLibcalls: Split lowering decisions into
 LibcallLoweringInfo (#164987)

Introduce a new class for the TargetLowering usage. This tracks the
subtarget specific lowering decisions for which libcall to use.
RuntimeLibcallsInfo is a module level property, which may have multiple
implementations of a particular libcall available. This attempts to be
a minimum boilerplate patch to introduce the new concept.

In the future we should have a tablegen way of selecting which
implementations should be used for a subtarget. Currently we
do have some conflicting implementations added, it just happens
to work out that the default cases to prefer is alphabetically
first (plus some of these still are using manual overrides
in TargetLowering constructors).
---
 .../llvm/CodeGen/LibcallLoweringInfo.h        | 66 ++++++++++++++++
 llvm/include/llvm/CodeGen/TargetLowering.h    | 19 +++--
 llvm/include/llvm/IR/RuntimeLibcalls.h        | 60 +++-----------
 llvm/lib/CodeGen/CMakeLists.txt               |  1 +
 llvm/lib/CodeGen/LibcallLoweringInfo.cpp      | 26 +++++++
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  8 +-
 llvm/lib/IR/RuntimeLibcalls.cpp               |  1 +
 llvm/lib/LTO/LTO.cpp                          |  7 +-
 .../WebAssemblyRuntimeLibcallSignatures.cpp   | 20 +++--
 .../Utils/DeclareRuntimeLibcalls.cpp          |  4 +-
 .../RuntimeLibcallEmitter-calling-conv.td     | 60 +++++++-------
 .../RuntimeLibcallEmitter-conflict-warning.td | 30 +++----
 llvm/test/TableGen/RuntimeLibcallEmitter.td   | 78 +++++++++----------
 .../Util/DeclareRuntimeLibcalls/basic.ll      |  9 ++-
 .../DeclareRuntimeLibcalls/sincos_stret.ll    |  6 +-
 .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 19 ++---
 16 files changed, 238 insertions(+), 176 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
 create mode 100644 llvm/lib/CodeGen/LibcallLoweringInfo.cpp

diff --git a/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
new file mode 100644
index 0000000000000..e8eceeed6aca6
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
@@ -0,0 +1,66 @@
+//===- LibcallLoweringInfo.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/RuntimeLibcalls.h"
+
+namespace llvm {
+
+class LibcallLoweringInfo {
+private:
+  LLVM_ABI const RTLIB::RuntimeLibcallsInfo &RTLCI;
+  /// Stores the implementation choice for each each libcall.
+  LLVM_ABI RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {
+      RTLIB::Unsupported};
+
+public:
+  LLVM_ABI LibcallLoweringInfo(const RTLIB::RuntimeLibcallsInfo &RTLCI);
+
+  /// Get the libcall routine name for the specified libcall.
+  // FIXME: This should be removed. Only LibcallImpl should have a name.
+  LLVM_ABI const char *getLibcallName(RTLIB::Libcall Call) const {
+    // FIXME: Return StringRef
+    return RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpls[Call])
+        .data();
+  }
+
+  /// Return the lowering's selection of implementation call for \p Call
+  LLVM_ABI RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const {
+    return LibcallImpls[Call];
+  }
+
+  /// Rename the default libcall routine name for the specified libcall.
+  LLVM_ABI void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) {
+    LibcallImpls[Call] = Impl;
+  }
+
+  // FIXME: Remove this wrapper in favor of directly using
+  // getLibcallImplCallingConv
+  LLVM_ABI CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
+    return RTLCI.LibcallImplCallingConvs[LibcallImpls[Call]];
+  }
+
+  /// Get the CallingConv that should be used for the specified libcall.
+  LLVM_ABI CallingConv::ID
+  getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
+    return RTLCI.LibcallImplCallingConvs[Call];
+  }
+
+  /// Return a function impl compatible with RTLIB::MEMCPY, or
+  /// RTLIB::Unsupported if fully unsupported.
+  RTLIB::LibcallImpl getMemcpyImpl() const {
+    RTLIB::LibcallImpl Memcpy = getLibcallImpl(RTLIB::MEMCPY);
+    if (Memcpy == RTLIB::Unsupported) {
+      // Fallback to memmove if memcpy isn't available.
+      return getLibcallImpl(RTLIB::MEMMOVE);
+    }
+
+    return Memcpy;
+  }
+};
+
+} // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index b229659415d55..2550c2bee5f71 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -29,6 +29,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/LibcallLoweringInfo.h"
 #include "llvm/CodeGen/LowLevelTypeUtils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RuntimeLibcallUtil.h"
@@ -3597,7 +3598,7 @@ class LLVM_ABI TargetLoweringBase {
   }
 
   const RTLIB::RuntimeLibcallsInfo &getRuntimeLibcallsInfo() const {
-    return Libcalls;
+    return RuntimeLibcallInfo;
   }
 
   void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) {
@@ -3610,9 +3611,9 @@ class LLVM_ABI TargetLoweringBase {
   }
 
   /// Get the libcall routine name for the specified libcall.
+  // FIXME: This should be removed. Only LibcallImpl should have a name.
   const char *getLibcallName(RTLIB::Libcall Call) const {
-    // FIXME: Return StringRef
-    return Libcalls.getLibcallName(Call).data();
+    return Libcalls.getLibcallName(Call);
   }
 
   /// Get the libcall routine name for the specified libcall implementation
@@ -3625,7 +3626,7 @@ class LLVM_ABI TargetLoweringBase {
   /// Check if this is valid libcall for the current module, otherwise
   /// RTLIB::Unsupported.
   RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const {
-    return Libcalls.getSupportedLibcallImpl(FuncName);
+    return RuntimeLibcallInfo.getSupportedLibcallImpl(FuncName);
   }
 
   /// Get the comparison predicate that's to be used to test the result of the
@@ -3633,11 +3634,6 @@ class LLVM_ABI TargetLoweringBase {
   /// floating-point compare libcalls.
   ISD::CondCode getSoftFloatCmpLibcallPredicate(RTLIB::LibcallImpl Call) const;
 
-  /// Set the CallingConv that should be used for the specified libcall.
-  void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
-    Libcalls.setLibcallImplCallingConv(Call, CC);
-  }
-
   /// Get the CallingConv that should be used for the specified libcall
   /// implementation.
   CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
@@ -3834,8 +3830,11 @@ class LLVM_ABI TargetLoweringBase {
   std::map<std::pair<unsigned, MVT::SimpleValueType>, MVT::SimpleValueType>
     PromoteToType;
 
+  /// FIXME: This should not live here; it should come from an analysis.
+  const RTLIB::RuntimeLibcallsInfo RuntimeLibcallInfo;
+
   /// The list of libcalls that the target will use.
-  RTLIB::RuntimeLibcallsInfo Libcalls;
+  LibcallLoweringInfo Libcalls;
 
   /// The bits of IndexedModeActions used to store the legalisation actions
   /// We store the data as   | ML | MS |  L |  S | each taking 4 bits.
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index bae760b3f981d..78e4b1723aafa 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -42,6 +42,8 @@ template <> struct enum_iteration_traits<RTLIB::LibcallImpl> {
   static constexpr bool is_iterable = true;
 };
 
+class LibcallLoweringInfo;
+
 namespace RTLIB {
 
 // Return an iterator over all Libcall values.
@@ -70,6 +72,8 @@ struct RuntimeLibcallsInfo {
   LibcallImplBitset AvailableLibcallImpls;
 
 public:
+  friend class llvm::LibcallLoweringInfo;
+
   explicit RuntimeLibcallsInfo(
       const Triple &TT,
       ExceptionHandling ExceptionModel = ExceptionHandling::None,
@@ -85,17 +89,6 @@ struct RuntimeLibcallsInfo {
     initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName);
   }
 
-  /// Rename the default libcall routine name for the specified libcall.
-  void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) {
-    LibcallImpls[Call] = Impl;
-  }
-
-  /// Get the libcall routine name for the specified libcall.
-  // FIXME: This should be removed. Only LibcallImpl should have a name.
-  StringRef getLibcallName(RTLIB::Libcall Call) const {
-    return getLibcallImplName(LibcallImpls[Call]);
-  }
-
   /// Get the libcall routine name for the specified libcall implementation.
   static StringRef getLibcallImplName(RTLIB::LibcallImpl CallImpl) {
     if (CallImpl == RTLIB::Unsupported)
@@ -105,42 +98,24 @@ struct RuntimeLibcallsInfo {
                      RuntimeLibcallNameSizeTable[CallImpl]);
   }
 
-  /// Return the lowering's selection of implementation call for \p Call
-  RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const {
-    return LibcallImpls[Call];
-  }
-
   /// Set the CallingConv that should be used for the specified libcall
   /// implementation
   void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
     LibcallImplCallingConvs[Call] = CC;
   }
 
-  // FIXME: Remove this wrapper in favor of directly using
-  // getLibcallImplCallingConv
-  CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
-    return LibcallImplCallingConvs[LibcallImpls[Call]];
-  }
-
   /// Get the CallingConv that should be used for the specified libcall.
   CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
     return LibcallImplCallingConvs[Call];
   }
 
-  ArrayRef<RTLIB::LibcallImpl> getLibcallImpls() const {
-    // Trim UNKNOWN_LIBCALL from the back
-    return ArrayRef(LibcallImpls).drop_back();
+  /// Return the libcall provided by \p Impl
+  static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) {
+    return ImplToLibcall[Impl];
   }
 
-  /// Return a function name compatible with RTLIB::MEMCPY, or nullptr if fully
-  /// unsupported.
-  RTLIB::LibcallImpl getMemcpyImpl() const {
-    RTLIB::LibcallImpl Memcpy = getLibcallImpl(RTLIB::MEMCPY);
-    if (Memcpy != RTLIB::Unsupported)
-      return Memcpy;
-
-    // Fallback to memmove if memcpy isn't available.
-    return getLibcallImpl(RTLIB::MEMMOVE);
+  unsigned getNumAvailableLibcallImpls() const {
+    return AvailableLibcallImpls.count();
   }
 
   bool isAvailable(RTLIB::LibcallImpl Impl) const {
@@ -151,11 +126,6 @@ struct RuntimeLibcallsInfo {
     AvailableLibcallImpls.set(Impl);
   }
 
-  /// Return the libcall provided by \p Impl
-  static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) {
-    return ImplToLibcall[Impl];
-  }
-
   /// Check if a function name is a recognized runtime call of any kind. This
   /// does not consider if this call is available for any current compilation,
   /// just that it is a known call somewhere. This returns the set of all
@@ -176,11 +146,8 @@ struct RuntimeLibcallsInfo {
   LLVM_ABI RTLIB::LibcallImpl
       getSupportedLibcallImpl(StringRef FuncName) const {
     for (RTLIB::LibcallImpl Impl : lookupLibcallImplName(FuncName)) {
-      // FIXME: This should not depend on looking up ImplToLibcall, only the
-      // list of libcalls for the module.
-      RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]];
-      if (Recognized != RTLIB::Unsupported)
-        return Recognized;
+      if (isAvailable(Impl))
+        return Impl;
     }
 
     return RTLIB::Unsupported;
@@ -197,10 +164,6 @@ struct RuntimeLibcallsInfo {
   LLVM_ABI static iota_range<RTLIB::LibcallImpl>
   lookupLibcallImplNameImpl(StringRef Name);
 
-  /// Stores the implementation choice for each each libcall.
-  RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {
-      RTLIB::Unsupported};
-
   static_assert(static_cast<int>(CallingConv::C) == 0,
                 "default calling conv should be encoded as 0");
 
@@ -274,6 +237,7 @@ struct RuntimeLibcallsInfo {
 };
 
 } // namespace RTLIB
+
 } // namespace llvm
 
 #endif // LLVM_IR_RUNTIME_LIBCALLS_H
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 4373c5397a3c6..1cf0b4964760b 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -88,6 +88,7 @@ add_llvm_component_library(LLVMCodeGen
   LatencyPriorityQueue.cpp
   LazyMachineBlockFrequencyInfo.cpp
   LexicalScopes.cpp
+  LibcallLoweringInfo.cpp
   LiveDebugVariables.cpp
   LiveIntervals.cpp
   LiveInterval.cpp
diff --git a/llvm/lib/CodeGen/LibcallLoweringInfo.cpp b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp
new file mode 100644
index 0000000000000..5c1698cb6060e
--- /dev/null
+++ b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp
@@ -0,0 +1,26 @@
+//===- LibcallLoweringInfo.cpp - Interface for runtime libcalls -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LibcallLoweringInfo.h"
+
+using namespace llvm;
+
+LibcallLoweringInfo::LibcallLoweringInfo(
+    const RTLIB::RuntimeLibcallsInfo &RTLCI)
+    : RTLCI(RTLCI) {
+  // TODO: This should be generated with lowering predicates, and assert the
+  // call is available.
+  for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+    if (RTLCI.isAvailable(Impl)) {
+      RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+      // FIXME: Hack, assume the first available libcall wins.
+      if (LibcallImpls[LC] == RTLIB::Unsupported)
+        LibcallImpls[LC] = Impl;
+    }
+  }
+}
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index b3535eaca5e9d..1cc591c17f9c3 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -697,9 +697,11 @@ ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate(
 
 /// NOTE: The TargetMachine owns TLOF.
 TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
-    : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.ExceptionModel,
-                       TM.Options.FloatABIType, TM.Options.EABIVersion,
-                       TM.Options.MCOptions.getABIName()) {
+    : TM(tm),
+      RuntimeLibcallInfo(TM.getTargetTriple(), TM.Options.ExceptionModel,
+                         TM.Options.FloatABIType, TM.Options.EABIVersion,
+                         TM.Options.MCOptions.getABIName()),
+      Libcalls(RuntimeLibcallInfo) {
   initActions();
 
   // Perform these initializations only once.
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 2ce5719228a0d..2fb01a4f95fea 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -19,6 +19,7 @@
 using namespace llvm;
 using namespace RTLIB;
 
+#define GET_RUNTIME_LIBCALLS_INFO
 #define GET_INIT_RUNTIME_LIBCALL_NAMES
 #define GET_SET_TARGET_RUNTIME_LIBCALL_SETS
 #define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 23be42f9d60ce..fefc733fa7697 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1396,11 +1396,10 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
 SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
   RTLIB::RuntimeLibcallsInfo Libcalls(TT);
   SmallVector<const char *> LibcallSymbols;
-  ArrayRef<RTLIB::LibcallImpl> LibcallImpls = Libcalls.getLibcallImpls();
-  LibcallSymbols.reserve(LibcallImpls.size());
+  LibcallSymbols.reserve(Libcalls.getNumAvailableLibcallImpls());
 
-  for (RTLIB::LibcallImpl Impl : LibcallImpls) {
-    if (Impl != RTLIB::Unsupported)
+  for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+    if (Libcalls.isAvailable(Impl))
       LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data());
   }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 45b0e7dc12263..f3c236ca8c9ce 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -532,13 +532,19 @@ struct StaticLibcallNameMap {
     // FIXME: This is broken if there are ever different triples compiled with
     // different libcalls.
     RTLIB::RuntimeLibcallsInfo RTCI(TT);
-    for (RTLIB::Libcall LC : RTLIB::libcalls()) {
-      StringRef NameLibcall = RTCI.getLibcallName(LC);
-      if (!NameLibcall.empty() &&
-          getRuntimeLibcallSignatures().Table[LC] != unsupported) {
-        assert(!Map.contains(NameLibcall) &&
-               "duplicate libcall names in name map");
-        Map[NameLibcall] = LC;
+
+    ArrayRef<RuntimeLibcallSignature> Table =
+        getRuntimeLibcallSignatures().Table;
+    for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+      if (!RTCI.isAvailable(Impl))
+        continue;
+      RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+      if (Table[LC] != unsupported) {
+        StringRef NameLibcall =
+            RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Impl);
+        // FIXME: Map should be to LibcallImpl
+        if (!Map.insert({NameLibcall, LC}).second)
+          llvm_unreachable("duplicate libcall names in name map");
       }
     }
   }
diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
index 6d4436b92c119..dd8706cfb2855 100644
--- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
+++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
@@ -54,8 +54,8 @@ PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M,
   const DataLayout &DL = M.getDataLayout();
   const Triple &TT = M.getTargetTriple();
 
-  for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) {
-    if (Impl == RTLIB::Unsupported)
+  for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+    if (!RTLCI.isAvailable(Impl))
       continue;
 
     auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl);
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
index 2904474f6110b..e4a7126d79fbd 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
@@ -53,21 +53,21 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT:   });
 // CHECK-NEXT:   AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_malloc, // malloc
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
-// CHECK-NEXT:        {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
-// CHECK-NEXT:        {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
+// CHECK-NEXT:        RTLIB::impl___divmodqi4, // __divmodqi4
+// CHECK-NEXT:        RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:      setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
@@ -80,21 +80,21 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT:   });
 // CHECK-NEXT:   AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_malloc, // malloc
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:   for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:     setLibcallImpl(Func, Impl);
+// CHECK-NEXT:   for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:     setAvailable(Impl);
 // CHECK-NEXT:   }
 // CHECK-EMPTY:
-// CHECK-NEXT:   static const LibcallImplPair LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
-// CHECK-NEXT:       {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
-// CHECK-NEXT:       {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT:   static const RTLIB::LibcallImpl LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
+// CHECK-NEXT:       RTLIB::impl___divmodqi4, // __divmodqi4
+// CHECK-NEXT:       RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT:   };
 // CHECK-EMPTY:
-// CHECK-NEXT:   for (const auto [Func, Impl] : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
-// CHECK-NEXT:     setLibcallImpl(Func, Impl);
+// CHECK-NEXT:   for (const RTLIB::LibcallImpl Impl : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
+// CHECK-NEXT:     setAvailable(Impl);
 // CHECK-NEXT:     setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT:   }
 // CHECK-EMPTY:
@@ -107,33 +107,33 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT:    });
 // CHECK-NEXT:    AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_malloc, // malloc
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if ( isFoo() ) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_anonymous_3_AVR_BUILTIN[] = {
-// CHECK-NEXT:          {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_anonymous_3_AVR_BUILTIN[] = {
+// CHECK-NEXT:          RTLIB::impl___divmodqi4, // __divmodqi4
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_anonymous_3_AVR_BUILTIN) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_anonymous_3_AVR_BUILTIN) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:        setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if ( isBar() ) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_anonymous_5_MSP430_BUILTIN[] = {
-// CHECK-NEXT:          {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_anonymous_5_MSP430_BUILTIN[] = {
+// CHECK-NEXT:          RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_anonymous_5_MSP430_BUILTIN) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_anonymous_5_MSP430_BUILTIN) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:        setLibcallImplCallingConv(Impl, CallingConv::MSP430_BUILTIN);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
index f9a148a183806..82206ce6ba254 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
@@ -31,12 +31,12 @@ def dup1 : RuntimeLibcallImpl<ANOTHER_DUP>;
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
 
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::SOME_FUNC, RTLIB::impl_func_b}, // func_b
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_func_b, // func_b
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    return;
@@ -53,13 +53,13 @@ def TheSystemLibraryA : SystemRuntimeLibrary<isTargetArchA,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT:        {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_other_func, // other_func
+// CHECK-NEXT:        RTLIB::impl_func_a, // func_a
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    return;
@@ -76,14 +76,14 @@ def TheSystemLibraryB : SystemRuntimeLibrary<isTargetArchB,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:     static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:         {RTLIB::ANOTHER_DUP, RTLIB::impl_dup1}, // dup1
-// CHECK-NEXT:         {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT:         {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT:     static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:         RTLIB::impl_dup1, // dup1
+// CHECK-NEXT:         RTLIB::impl_other_func, // other_func
+// CHECK-NEXT:         RTLIB::impl_func_a, // func_a
 // CHECK-NEXT:     };
 // CHECK-EMPTY:
-// CHECK-NEXT:     for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:       setLibcallImpl(Func, Impl);
+// CHECK-NEXT:     for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:       setAvailable(Impl);
 // CHECK-NEXT:     }
 // CHECK-EMPTY:
 // CHECK-NEXT:    return;
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index 7aaf3a0e8e1cf..2a1cc72efcd4b 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -200,10 +200,6 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: }
 
 // CHECK: void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets(const llvm::Triple &TT, ExceptionHandling ExceptionModel, FloatABI::ABIType FloatABI, EABI EABIVersion, StringRef ABIName) {
-// CHECK-NEXT:  struct LibcallImplPair {
-// CHECK-NEXT:    RTLIB::Libcall Func;
-// CHECK-NEXT:    RTLIB::LibcallImpl Impl;
-// CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.getArch() == Triple::blah) {
 // CHECK-NEXT:     static constexpr LibcallImplBitset SystemAvailableImpls({
@@ -211,35 +207,35 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT:     });
 // CHECK-NEXT:     AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
-// CHECK-NEXT:        {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
-// CHECK-NEXT:        {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_bzero, // bzero
+// CHECK-NEXT:        RTLIB::impl_calloc, // calloc
+// CHECK-NEXT:        RTLIB::impl_sqrtl_f128, // sqrtl
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if (TT.hasCompilerRT()) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_hasCompilerRT[] = {
-// CHECK-NEXT:          {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT:          {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_hasCompilerRT[] = {
+// CHECK-NEXT:          RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT:          RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_hasCompilerRT) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_hasCompilerRT) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if (TT.getOS() == Triple::bar) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_isBarOS[] = {
-// CHECK-NEXT:          {RTLIB::MEMSET, RTLIB::impl____memset}, // ___memset
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_isBarOS[] = {
+// CHECK-NEXT:          RTLIB::impl____memset, // ___memset
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_isBarOS) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_isBarOS) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
 // CHECK-NEXT:    }
@@ -253,14 +249,14 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT:    });
 // CHECK-NEXT:    AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT:        {RTLIB::SQRT_F80, RTLIB::impl_sqrtl_f80}, // sqrtl
-// CHECK-NEXT:        {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT:        RTLIB::impl_sqrtl_f80, // sqrtl
+// CHECK-NEXT:        RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:   return;
@@ -272,22 +268,22 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT:    });
 // CHECK-NEXT:    AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
-// CHECK-NEXT:        {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_bzero, // bzero
+// CHECK-NEXT:        RTLIB::impl_sqrtl_f128, // sqrtl
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if (TT.getOS() == Triple::bar) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_isBarOS[] = {
-// CHECK-NEXT:          {RTLIB::MEMSET, RTLIB::impl____memset}, // ___memset
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_isBarOS[] = {
+// CHECK-NEXT:          RTLIB::impl____memset, // ___memset
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_isBarOS) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_isBarOS) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
 // CHECK-NEXT:    }
@@ -301,15 +297,15 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT:    });
 // CHECK-NEXT:    AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
-// CHECK-NEXT:        {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT:        {RTLIB::SQRT_F80, RTLIB::impl_sqrtl_f80}, // sqrtl
-// CHECK-NEXT:        {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_calloc, // calloc
+// CHECK-NEXT:        RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT:        RTLIB::impl_sqrtl_f80, // sqrtl
+// CHECK-NEXT:        RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:   return;
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll
index c005316f07f06..4c8c829a59f3c 100644
--- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll
+++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll
@@ -10,10 +10,15 @@ define float @sinf(float %x) {
   ret float %x
 }
 
+; CHECK: declare void @_Unwind_Resume(...)
+
+; CHECK: declare void @__umodti3(...)
+
 ; CHECK: declare void @acosf(...)
 
-; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]]
 ; CHECK: declare nofpclass(ninf nsub nnorm) double @sqrt(double) [[SQRT_ATTRS:#[0-9]+]]
 
-; CHECK: declare void @__umodti3(...)
+; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]]
+
+; CHECK: declare void @truncl(...)
 
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
index f0f09e97d9dba..57cb016bcb7f3 100644
--- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
+++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
@@ -7,14 +7,14 @@
 ; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s %}
 ; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s %}
 
-; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 ; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]]
+; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 
-; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 ; STRUCT: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]]
+; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 
-; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]]
 ; SRET: declare void @__sincos_stret(ptr sret({ double, double }) align 4, double) [[SINCOS_ATTRS:#[0-9]+]]
+; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]]
 
 ; CHECK: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) }
 ; SRET: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write, errnomem: write) }
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index 6a36f471678bf..001ca7b658d3c 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -544,11 +544,8 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
   OS << "void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets("
         "const llvm::Triple &TT, ExceptionHandling ExceptionModel, "
         "FloatABI::ABIType FloatABI, EABI EABIVersion, "
-        "StringRef ABIName) {\n"
-        "  struct LibcallImplPair {\n"
-        "    RTLIB::Libcall Func;\n"
-        "    RTLIB::LibcallImpl Impl;\n"
-        "  };\n";
+        "StringRef ABIName) {\n";
+
   ArrayRef<const Record *> AllLibs =
       Records.getAllDerivedDefinitions("SystemRuntimeLibrary");
 
@@ -703,7 +700,7 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
       Funcs.erase(UniqueI, Funcs.end());
 
       OS << indent(IndentDepth + 2)
-         << "static const LibcallImplPair LibraryCalls";
+         << "static const RTLIB::LibcallImpl LibraryCalls";
       SubsetPredicate.emitTableVariableNameSuffix(OS);
       if (FuncsWithCC.CallingConv)
         OS << '_' << FuncsWithCC.CallingConv->getName();
@@ -711,18 +708,18 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
       OS << "[] = {\n";
       for (const RuntimeLibcallImpl *LibCallImpl : Funcs) {
         OS << indent(IndentDepth + 6);
-        LibCallImpl->emitTableEntry(OS);
+        LibCallImpl->emitEnumEntry(OS);
+        OS << ", // " << LibCallImpl->getLibcallFuncName() << '\n';
       }
 
       OS << indent(IndentDepth + 2) << "};\n\n"
          << indent(IndentDepth + 2)
-         << "for (const auto [Func, Impl] : LibraryCalls";
+         << "for (const RTLIB::LibcallImpl Impl : LibraryCalls";
       SubsetPredicate.emitTableVariableNameSuffix(OS);
       if (FuncsWithCC.CallingConv)
         OS << '_' << FuncsWithCC.CallingConv->getName();
 
-      OS << ") {\n"
-         << indent(IndentDepth + 4) << "setLibcallImpl(Func, Impl);\n";
+      OS << ") {\n" << indent(IndentDepth + 4) << "setAvailable(Impl);\n";
 
       if (FuncsWithCC.CallingConv) {
         StringRef CCEnum =
@@ -759,7 +756,7 @@ void RuntimeLibcallEmitter::run(raw_ostream &OS) {
   emitGetInitRuntimeLibcallNames(OS);
 
   {
-    IfDefEmitter IfDef(OS, "GET_SET_TARGET_RUNTIME_LIBCALL_SETS");
+    IfDefEmitter IfDef(OS, "GET_RUNTIME_LIBCALLS_INFO");
     emitSystemRuntimeLibrarySetCalls(OS);
   }
 }

From dd8892300e7279e4b3ea5e085defe14d4849626f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 5 Nov 2025 09:17:58 -0800
Subject: [PATCH 30/61] RuntimeLibcalls: Remove LibcallLoweringPredicate from
 RuntimeLibcallImpl (#166585)

This is unused and will not make sense.
---
 llvm/include/llvm/IR/RuntimeLibcallsImpl.td | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td
index b5752c1b69ad8..92853125379f5 100644
--- a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td
+++ b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td
@@ -61,7 +61,6 @@ class RuntimeLibcall {
 class RuntimeLibcallImpl<RuntimeLibcall P, string Name = NAME> {
   RuntimeLibcall Provides = P;
   string LibCallFuncName = Name;
-  list<LibcallLoweringPredicate> LoweringPredicates;
   bit IsDefault = false;
 }
 

From 6312d2751144bd53af7ef56798cbe60aa8b2fb56 Mon Sep 17 00:00:00 2001
From: Finn Plummer <mail@inbelic.dev>
Date: Wed, 5 Nov 2025 09:18:49 -0800
Subject: [PATCH 31/61] [DirectX] Emit `hlsl.wavesize` function attribute as
 entry property metadata (#165624)

This pr adds support for emitting the `hlsl.wavesize` function attribute
as an entry property metadata for a compute shader.

It follows the implementation of `hlsl.numthreads`.

- Collects the wave range information from the function attribute in
`DXILMetadataAnalysis`
- Introduce the `WaveRange` property tag
- Emit a `WaveSize` or `WaveRange` metadata (depending on shader model)
in `DXILTranslateMetadata`
- Add tests for valid/invalid scenarios
- Updates the base `PSVInfo` to reflect the min/max wave lane counts

Resolves #70118
---
 .../llvm/Analysis/DXILMetadataAnalysis.h      |  3 +
 llvm/lib/Analysis/DXILMetadataAnalysis.cpp    | 16 ++++
 .../lib/Target/DirectX/DXContainerGlobals.cpp |  7 ++
 .../Target/DirectX/DXILTranslateMetadata.cpp  | 66 ++++++++++---
 llvm/test/CodeGen/DirectX/wavesize-md-errs.ll | 31 ++++++
 .../test/CodeGen/DirectX/wavesize-md-valid.ll | 96 +++++++++++++++++++
 6 files changed, 206 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/wavesize-md-errs.ll
 create mode 100644 llvm/test/CodeGen/DirectX/wavesize-md-valid.ll

diff --git a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h
index cb535ac14f1c6..a1b030c157eae 100644
--- a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h
+++ b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h
@@ -27,6 +27,9 @@ struct EntryProperties {
   unsigned NumThreadsX{0}; // X component
   unsigned NumThreadsY{0}; // Y component
   unsigned NumThreadsZ{0}; // Z component
+  unsigned WaveSizeMin{0}; // Minimum component
+  unsigned WaveSizeMax{0}; // Maximum component
+  unsigned WaveSizePref{0}; // Preferred component
 
   EntryProperties(const Function *Fn = nullptr) : Entry(Fn) {};
 };
diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
index 23f1aa82ae8a3..bd77cba385667 100644
--- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
+++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
@@ -66,6 +66,22 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) {
       Success = llvm::to_integer(NumThreadsVec[2], EFP.NumThreadsZ, 10);
       assert(Success && "Failed to parse Z component of numthreads");
     }
+    // Get wavesize attribute value, if one exists
+    StringRef WaveSizeStr =
+        F.getFnAttribute("hlsl.wavesize").getValueAsString();
+    if (!WaveSizeStr.empty()) {
+      SmallVector<StringRef> WaveSizeVec;
+      WaveSizeStr.split(WaveSizeVec, ',');
+      assert(WaveSizeVec.size() == 3 && "Invalid wavesize specified");
+      // Read in the three component values of numthreads
+      [[maybe_unused]] bool Success =
+          llvm::to_integer(WaveSizeVec[0], EFP.WaveSizeMin, 10);
+      assert(Success && "Failed to parse Min component of wavesize");
+      Success = llvm::to_integer(WaveSizeVec[1], EFP.WaveSizeMax, 10);
+      assert(Success && "Failed to parse Max component of wavesize");
+      Success = llvm::to_integer(WaveSizeVec[2], EFP.WaveSizePref, 10);
+      assert(Success && "Failed to parse Preferred component of wavesize");
+    }
     MMDAI.EntryPropertyVec.push_back(EFP);
   }
   return MMDAI;
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index eb4c8846441a2..677203d1c016b 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -285,6 +285,13 @@ void DXContainerGlobals::addPipelineStateValidationInfo(
     PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX;
     PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY;
     PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ;
+    if (MMI.EntryPropertyVec[0].WaveSizeMin) {
+      PSV.BaseData.MinimumWaveLaneCount = MMI.EntryPropertyVec[0].WaveSizeMin;
+      PSV.BaseData.MaximumWaveLaneCount =
+          MMI.EntryPropertyVec[0].WaveSizeMax
+              ? MMI.EntryPropertyVec[0].WaveSizeMax
+              : MMI.EntryPropertyVec[0].WaveSizeMin;
+    }
     break;
   default:
     break;
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index cf8b833b3e42e..e1a472fe57642 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -82,6 +82,7 @@ enum class EntryPropsTag {
   ASStateTag,
   WaveSize,
   EntryRootSig,
+  WaveRange = 23,
 };
 
 } // namespace
@@ -177,14 +178,15 @@ getTagValueAsMetadata(EntryPropsTag Tag, uint64_t Value, LLVMContext &Ctx) {
   case EntryPropsTag::ASStateTag:
   case EntryPropsTag::WaveSize:
   case EntryPropsTag::EntryRootSig:
+  case EntryPropsTag::WaveRange:
     llvm_unreachable("NYI: Unhandled entry property tag");
   }
   return MDVals;
 }
 
-static MDTuple *
-getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
-                       const Triple::EnvironmentType ShaderProfile) {
+static MDTuple *getEntryPropAsMetadata(Module &M, const EntryProperties &EP,
+                                       uint64_t EntryShaderFlags,
+                                       const ModuleMetadataInfo &MMDI) {
   SmallVector<Metadata *> MDVals;
   LLVMContext &Ctx = EP.Entry->getContext();
   if (EntryShaderFlags != 0)
@@ -195,12 +197,13 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
     // FIXME: support more props.
     // See https://github.com/llvm/llvm-project/issues/57948.
     // Add shader kind for lib entries.
-    if (ShaderProfile == Triple::EnvironmentType::Library &&
+    if (MMDI.ShaderProfile == Triple::EnvironmentType::Library &&
         EP.ShaderStage != Triple::EnvironmentType::Library)
       MDVals.append(getTagValueAsMetadata(EntryPropsTag::ShaderKind,
                                           getShaderStage(EP.ShaderStage), Ctx));
 
     if (EP.ShaderStage == Triple::EnvironmentType::Compute) {
+      // Handle mandatory "hlsl.numthreads"
       MDVals.emplace_back(ConstantAsMetadata::get(ConstantInt::get(
           Type::getInt32Ty(Ctx), static_cast<int>(EntryPropsTag::NumThreads))));
       Metadata *NumThreadVals[] = {ConstantAsMetadata::get(ConstantInt::get(
@@ -210,8 +213,48 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
                                    ConstantAsMetadata::get(ConstantInt::get(
                                        Type::getInt32Ty(Ctx), EP.NumThreadsZ))};
       MDVals.emplace_back(MDNode::get(Ctx, NumThreadVals));
+
+      // Handle optional "hlsl.wavesize". The fields are optionally represented
+      // if they are non-zero.
+      if (EP.WaveSizeMin != 0) {
+        bool IsWaveRange = VersionTuple(6, 8) <= MMDI.ShaderModelVersion;
+        bool IsWaveSize =
+            !IsWaveRange && VersionTuple(6, 6) <= MMDI.ShaderModelVersion;
+
+        if (!IsWaveRange && !IsWaveSize) {
+          reportError(M, "Shader model 6.6 or greater is required to specify "
+                         "the \"hlsl.wavesize\" function attribute");
+          return nullptr;
+        }
+
+        // A range is being specified if EP.WaveSizeMax != 0
+        if (EP.WaveSizeMax && !IsWaveRange) {
+          reportError(
+              M, "Shader model 6.8 or greater is required to specify "
+                 "wave size range values of the \"hlsl.wavesize\" function "
+                 "attribute");
+          return nullptr;
+        }
+
+        EntryPropsTag Tag =
+            IsWaveSize ? EntryPropsTag::WaveSize : EntryPropsTag::WaveRange;
+        MDVals.emplace_back(ConstantAsMetadata::get(
+            ConstantInt::get(Type::getInt32Ty(Ctx), static_cast<int>(Tag))));
+
+        SmallVector<Metadata *> WaveSizeVals = {ConstantAsMetadata::get(
+            ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMin))};
+        if (IsWaveRange) {
+          WaveSizeVals.push_back(ConstantAsMetadata::get(
+              ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMax)));
+          WaveSizeVals.push_back(ConstantAsMetadata::get(
+              ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizePref)));
+        }
+
+        MDVals.emplace_back(MDNode::get(Ctx, WaveSizeVals));
+      }
     }
   }
+
   if (MDVals.empty())
     return nullptr;
   return MDNode::get(Ctx, MDVals);
@@ -236,12 +279,11 @@ static MDTuple *constructEntryMetadata(const Function *EntryFn,
   return MDNode::get(Ctx, MDVals);
 }
 
-static MDTuple *emitEntryMD(const EntryProperties &EP, MDTuple *Signatures,
-                            MDNode *MDResources,
+static MDTuple *emitEntryMD(Module &M, const EntryProperties &EP,
+                            MDTuple *Signatures, MDNode *MDResources,
                             const uint64_t EntryShaderFlags,
-                            const Triple::EnvironmentType ShaderProfile) {
-  MDTuple *Properties =
-      getEntryPropAsMetadata(EP, EntryShaderFlags, ShaderProfile);
+                            const ModuleMetadataInfo &MMDI) {
+  MDTuple *Properties = getEntryPropAsMetadata(M, EP, EntryShaderFlags, MMDI);
   return constructEntryMetadata(EP.Entry, Signatures, MDResources, Properties,
                                 EP.Entry->getContext());
 }
@@ -523,10 +565,8 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
                    Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) +
                          "'"));
     }
-
-    EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD,
-                                            EntryShaderFlags,
-                                            MMDI.ShaderProfile));
+    EntryFnMDNodes.emplace_back(emitEntryMD(
+        M, EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI));
   }
 
   NamedMDNode *EntryPointsNamedMD =
diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll
new file mode 100644
index 0000000000000..9016c5d7e8d44
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll
@@ -0,0 +1,31 @@
+; RUN: split-file %s %t
+; RUN: not opt -S --dxil-translate-metadata %t/low-sm.ll 2>&1 | FileCheck %t/low-sm.ll
+; RUN: not opt -S --dxil-translate-metadata %t/low-sm-for-range.ll 2>&1 | FileCheck %t/low-sm-for-range.ll
+
+; Test that wavesize metadata is only allowed on applicable shader model versions
+
+;--- low-sm.ll
+
+; CHECK: Shader model 6.6 or greater is required to specify the "hlsl.wavesize" function attribute
+
+target triple = "dxil-unknown-shadermodel6.5-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- low-sm-for-range.ll
+
+; CHECK: Shader model 6.8 or greater is required to specify wave size range values of the "hlsl.wavesize" function attribute
+
+target triple = "dxil-unknown-shadermodel6.7-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll
new file mode 100644
index 0000000000000..3ad6c1d034252
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll
@@ -0,0 +1,96 @@
+; RUN: split-file %s %t
+; RUN: opt -S --dxil-translate-metadata %t/only.ll | FileCheck %t/only.ll
+; RUN: opt -S --dxil-translate-metadata %t/min.ll | FileCheck %t/min.ll
+; RUN: opt -S --dxil-translate-metadata %t/max.ll | FileCheck %t/max.ll
+; RUN: opt -S --dxil-translate-metadata %t/pref.ll | FileCheck %t/pref.ll
+
+; RUN: llc --filetype=obj %t/only.ll -o - | obj2yaml | FileCheck %t/only.ll --check-prefix=OBJ
+; RUN: llc --filetype=obj %t/min.ll -o - | obj2yaml | FileCheck %t/min.ll --check-prefix=OBJ
+; RUN: llc --filetype=obj %t/max.ll -o - | obj2yaml | FileCheck %t/max.ll --check-prefix=OBJ
+; RUN: llc --filetype=obj %t/pref.ll -o - | obj2yaml | FileCheck %t/pref.ll --check-prefix=OBJ
+
+; Test that wave size/range metadata is correctly generated with the correct tag
+
+;--- only.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 11, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16}
+
+; OBJ: - Name:    PSV0
+; OBJ:   PSVInfo:
+; OBJ:     MinimumWaveLaneCount: 16
+; OBJ:     MaximumWaveLaneCount: 16
+
+target triple = "dxil-unknown-shadermodel6.6-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- min.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 0, i32 0}
+
+; OBJ: - Name:    PSV0
+; OBJ:   PSVInfo:
+; OBJ:     MinimumWaveLaneCount: 16
+; OBJ:     MaximumWaveLaneCount: 16
+
+target triple = "dxil-unknown-shadermodel6.8-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- max.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 32, i32 0}
+
+; OBJ: - Name:    PSV0
+; OBJ:   PSVInfo:
+; OBJ:     MinimumWaveLaneCount: 16
+; OBJ:     MaximumWaveLaneCount: 32
+
+target triple = "dxil-unknown-shadermodel6.8-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- pref.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 64, i32 32}
+
+; OBJ: - Name:    PSV0
+; OBJ:   PSVInfo:
+; OBJ:     MinimumWaveLaneCount: 16
+; OBJ:     MaximumWaveLaneCount: 64
+
+target triple = "dxil-unknown-shadermodel6.8-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,64,32" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }

From a1f0fe140a0e8f15dcd33df42b2c8cf170f69db8 Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Wed, 5 Nov 2025 18:19:43 +0100
Subject: [PATCH 32/61] [MsDemangle] Use NodeList over SmallVector for target
 names (#166586)

Using `SmallVector` would introduce a dependency cycle (see
https://github.com/llvm/llvm-project/pull/155630#discussion_r2495268497),
so this uses a NodeList.
---
 llvm/lib/Demangle/MicrosoftDemangle.cpp | 50 ++++++++++++-------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp
index 250d382998982..0aefe6e077c24 100644
--- a/llvm/lib/Demangle/MicrosoftDemangle.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp
@@ -15,8 +15,6 @@
 
 #include "llvm/Demangle/MicrosoftDemangle.h"
 
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/Demangle/DemangleConfig.h"
 #include "llvm/Demangle/MicrosoftDemangleNodes.h"
@@ -279,13 +277,16 @@ demanglePointerCVQualifiers(std::string_view &MangledName) {
   DEMANGLE_UNREACHABLE;
 }
 
-static NodeArrayNode *smallVecToNodeArray(ArenaAllocator &Arena,
-                                          ArrayRef<Node *> Vec) {
-  NodeArrayNode *Arr = Arena.alloc<NodeArrayNode>();
-  Arr->Count = Vec.size();
-  Arr->Nodes = Arena.allocArray<Node *>(Vec.size());
-  std::memcpy(Arr->Nodes, Vec.data(), Vec.size() * sizeof(Node *));
-  return Arr;
+static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head,
+                                          size_t Count) {
+  NodeArrayNode *N = Arena.alloc<NodeArrayNode>();
+  N->Count = Count;
+  N->Nodes = Arena.allocArray<Node *>(Count);
+  for (size_t I = 0; I < Count; ++I) {
+    N->Nodes[I] = Head->N;
+    Head = Head->Next;
+  }
+  return N;
 }
 
 std::string_view Demangler::copyString(std::string_view Borrowed) {
@@ -335,17 +336,28 @@ Demangler::demangleSpecialTableSymbolNode(std::string_view &MangledName,
 
   std::tie(STSN->Quals, IsMember) = demangleQualifiers(MangledName);
 
-  SmallVector<Node *, 1> TargetNames;
+  NodeList *TargetCurrent = nullptr;
+  NodeList *TargetHead = nullptr;
+  size_t Count = 0;
   while (!consumeFront(MangledName, '@')) {
+    ++Count;
+
+    NodeList *Next = Arena.alloc<NodeList>();
+    if (TargetCurrent)
+      TargetCurrent->Next = Next;
+    else
+      TargetHead = Next;
+
+    TargetCurrent = Next;
     QualifiedNameNode *QN = demangleFullyQualifiedTypeName(MangledName);
     if (Error)
       return nullptr;
     assert(QN);
-    TargetNames.push_back(QN);
+    TargetCurrent->N = QN;
   }
 
-  if (!TargetNames.empty())
-    STSN->TargetNames = smallVecToNodeArray(Arena, TargetNames);
+  if (Count > 0)
+    STSN->TargetNames = nodeListToNodeArray(Arena, TargetHead, Count);
 
   return STSN;
 }
@@ -1627,18 +1639,6 @@ Demangler::demangleNameScopePiece(std::string_view &MangledName) {
   return demangleSimpleName(MangledName, /*Memorize=*/true);
 }
 
-static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head,
-                                          size_t Count) {
-  NodeArrayNode *N = Arena.alloc<NodeArrayNode>();
-  N->Count = Count;
-  N->Nodes = Arena.allocArray<Node *>(Count);
-  for (size_t I = 0; I < Count; ++I) {
-    N->Nodes[I] = Head->N;
-    Head = Head->Next;
-  }
-  return N;
-}
-
 QualifiedNameNode *
 Demangler::demangleNameScopeChain(std::string_view &MangledName,
                                   IdentifierNode *UnqualifiedName) {

From 9564b26f81f481f91299ebc446011ed4e5407400 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 5 Nov 2025 09:20:10 -0800
Subject: [PATCH 33/61] [lldb] Support a Stable ABI LLDB_PYTHON_EXT_SUFFIX
 (#166269)

When building against the Python Stable API, we should use the `abi3`
ABI tag. Otherwise, Python will refuse to import the native shared
object. This PR adds support for generating a stable ABI compatible
suffix when `LLDB_ENABLE_PYTHON_LIMITED_API` is set.

Previously, on Darwin when building against Python 3.14, you would end
up with `_lldb.cpython-314-darwin.so`. Now, when using the stable ABI,
you get `_lldb.abi3.so` instead. A different version of the Python
interpreter will not consider loading the former, but will load the
latter.
---
 lldb/CMakeLists.txt                       | 5 +++++
 lldb/bindings/python/get-python-config.py | 8 +++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt
index 01b5546fee00d..0736e6ba132c8 100644
--- a/lldb/CMakeLists.txt
+++ b/lldb/CMakeLists.txt
@@ -62,11 +62,16 @@ if (LLDB_ENABLE_PYTHON)
   set(cachestring_LLDB_PYTHON_EXT_SUFFIX
     "Filename extension for native code python modules")
 
+  if (LLDB_ENABLE_PYTHON_LIMITED_API)
+    set(stable_abi "--stable-abi")
+  endif()
+
   foreach(var LLDB_PYTHON_RELATIVE_PATH LLDB_PYTHON_EXE_RELATIVE_PATH LLDB_PYTHON_EXT_SUFFIX)
     if(NOT DEFINED ${var} AND NOT CMAKE_CROSSCOMPILING)
       execute_process(
         COMMAND ${Python3_EXECUTABLE}
           ${CMAKE_CURRENT_SOURCE_DIR}/bindings/python/get-python-config.py
+          ${stable_abi}
           ${var}
         OUTPUT_VARIABLE value
         OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/lldb/bindings/python/get-python-config.py b/lldb/bindings/python/get-python-config.py
index ae84cbb1215a9..bf8cc48b013e1 100755
--- a/lldb/bindings/python/get-python-config.py
+++ b/lldb/bindings/python/get-python-config.py
@@ -18,6 +18,9 @@ def relpath_nodots(path, base):
 def main():
     parser = argparse.ArgumentParser(description="extract cmake variables from python")
     parser.add_argument("variable_name")
+    parser.add_argument(
+        "--stable-abi", action="store_true", help="Target the Stable C ABI"
+    )
     args = parser.parse_args()
     if args.variable_name == "LLDB_PYTHON_RELATIVE_PATH":
         # LLDB_PYTHON_RELATIVE_PATH is the relative path from lldb's prefix
@@ -68,7 +71,10 @@ def main():
                     print("sys.prefix:", sys.prefix, file=sys.stderr)
                     sys.exit(1)
     elif args.variable_name == "LLDB_PYTHON_EXT_SUFFIX":
-        print(sysconfig.get_config_var("EXT_SUFFIX"))
+        if args.stable_abi:
+            print(".abi3%s" % sysconfig.get_config_var("SHLIB_SUFFIX"))
+        else:
+            print(sysconfig.get_config_var("EXT_SUFFIX"))
     else:
         parser.error(f"unknown variable {args.variable_name}")
 

From f60e69315e9ed94b2b330acb39a766ac86aa1f80 Mon Sep 17 00:00:00 2001
From: Prabhu Rajasekaran <prabhukr@google.com>
Date: Wed, 5 Nov 2025 09:22:08 -0800
Subject: [PATCH 34/61] [llvm] Emit canonical linkage correct function symbol
 (#166487)

In the call graph section, we were emitting the temporary label
pointing to the start of the function instead of the canonical linkage
correct function symbol. This patch fixes it and updates the
corresponding tests.
---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp            | 3 +--
 llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll | 3 +--
 llvm/test/CodeGen/ARM/call-graph-section-assembly.ll  | 3 +--
 llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll  | 2 +-
 llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll | 3 +--
 llvm/test/CodeGen/X86/call-graph-section-assembly.ll  | 3 +--
 6 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index f65d88a669f13..713277d0bc5e0 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1708,7 +1708,6 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF,
   OutStreamer->pushSection();
   OutStreamer->switchSection(FuncCGSection);
 
-  const MCSymbol *FunctionSymbol = getFunctionBegin();
   const Function &F = MF.getFunction();
   // If this function has external linkage or has its address taken and
   // it is not a callback, then anything could call it.
@@ -1747,7 +1746,7 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF,
   // 8) Each unique indirect target type id.
   OutStreamer->emitInt8(CallGraphSectionFormatVersion::V_0);
   OutStreamer->emitInt8(static_cast<uint8_t>(CGFlags));
-  OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
+  OutStreamer->emitSymbolValue(getSymbol(&F), TM.getProgramPointerSize());
   const auto *TypeId = extractNumericCGTypeId(F);
   if (IsIndirectTarget && TypeId)
     OutStreamer->emitInt64(TypeId->getZExtValue());
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
index cabd43edff9d6..9e243aec1128d 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
@@ -14,7 +14,6 @@ entry:
 }
 
 ; CHECK: _ZL10myCallbacki:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
 define internal void @_ZL10myCallbacki(i32 %value) !type !2 {
 entry:
   %sink = alloca i32, align 4
@@ -33,7 +32,7 @@ entry:
 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
 ; CHECK-NEXT: .byte   1
 ;; Function Entry PC
-; CHECK-NEXT: .long   [[LABEL_FUNC]]
+; CHECK-NEXT: .long _ZL10myCallbacki
 ;; Function type ID -5212364466660467813
 ; CHECK-NEXT: .long	1154849691
 ; CHECK-NEXT: .long	3081369122
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
index 3d3974ee6ba3b..8e8881ee722fb 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
@@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8)
 declare !type !2 ptr @direct_baz(ptr)
 
 ; CHECK: ball:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
 define ptr @ball() {
 entry:
   call void @direct_foo()
@@ -42,7 +41,7 @@ entry:
 ;; Flags
 ; CHECK-NEXT: .byte   7
 ;; Function Entry PC
-; CHECK-NEXT: .long   [[LABEL_FUNC]]
+; CHECK-NEXT: .long ball
 ;; Function type ID -- set to 0 as no type metadata attached to function.
 ; CHECK-NEXT: .long   0
 ; CHECK-NEXT: .long   0
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
index 80360041c106a..35e570bdde405 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
@@ -29,6 +29,6 @@ declare !type !2 i32 @bar(i8 signext)
 
 ; CHECK:      Hex dump of section '.llvm.callgraph':
 ; CHECK-NEXT: 0x00000000 00050000 00008e19 0b7f3326 e3000154
-; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05100000 00a150b8
+; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05000000 00a150b8
 ;; Verify that the type id 0x308e4b8159bc8654 is in section.
 ; CHECK-NEXT: 0x00000020 3e0cfe3c b2015486 bc59814b 8e30
diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
index f36baba402421..ab8498d8d3451 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
@@ -14,7 +14,6 @@ entry:
 }
 
 ; CHECK: _ZL10myCallbacki:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
 define internal void @_ZL10myCallbacki(i32 %value) !type !2 {
 entry:
   %sink = alloca i32, align 4
@@ -33,6 +32,6 @@ entry:
 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
 ; CHECK-NEXT: .byte   1
 ;; Function Entry PC
-; CHECK-NEXT: .quad   [[LABEL_FUNC]]
+; CHECK-NEXT: .quad   _ZL10myCallbacki
 ;; Function type ID
 ; CHECK-NEXT: .quad   -5212364466660467813
diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
index cdbad668aec54..02d71073b65c5 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
@@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8)
 declare !type !2 ptr @direct_baz(ptr)
 
 ; CHECK: ball:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
 define ptr @ball() {
 entry:
   call void @direct_foo()
@@ -42,7 +41,7 @@ entry:
 ;; Flags
 ; CHECK-NEXT: .byte   7
 ;; Function Entry PC
-; CHECK-NEXT: .quad   [[LABEL_FUNC]]
+; CHECK-NEXT: .quad ball
 ;; Function type ID -- set to 0 as no type metadata attached to function.
 ; CHECK-NEXT: .quad   0
 ;; Number of unique direct callees.

From cb41408d3c2de0d79b9c4c39ed2a8639906bc572 Mon Sep 17 00:00:00 2001
From: Andrei Safronov <andrei.safronov@espressif.com>
Date: Wed, 5 Nov 2025 20:27:11 +0300
Subject: [PATCH 35/61] [Xtensa] Fix S32C1I instruction encoding and
 copyPhysReg. (#165174)

Fix S21C1I instruction encoding.Fix special registers parsing for S32C1I
feature. Fix copyPhysReg function for f32 registers copy.
---
 .../MCTargetDesc/XtensaMCCodeEmitter.cpp      |  2 +-
 .../MCTargetDesc/XtensaMCTargetDesc.cpp       |  2 +-
 llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp    | 21 +++++++++++++++++--
 llvm/test/CodeGen/Xtensa/s32c1i.ll            |  7 +++++++
 llvm/test/MC/Xtensa/s32c1i.s                  | 13 ++++++++++++
 5 files changed, 41 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/Xtensa/s32c1i.ll
 create mode 100644 llvm/test/MC/Xtensa/s32c1i.s

diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
index bd4d4ebd2a729..5977a276b1236 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
@@ -320,7 +320,7 @@ XtensaMCCodeEmitter::getMemRegEncoding(const MCInst &MI, unsigned OpNo,
   case Xtensa::SSIP:
   case Xtensa::LSI:
   case Xtensa::LSIP:
-
+  case Xtensa::S32C1I:
     if (Res & 0x3) {
       report_fatal_error("Unexpected operand value!");
     }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 4e730707dcb78..8d0fd078b2696 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -202,7 +202,7 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
     return FeatureBits[Xtensa::FeatureWindowed];
   case Xtensa::ATOMCTL:
   case Xtensa::SCOMPARE1:
-    return FeatureBits[Xtensa::FeatureWindowed];
+    return FeatureBits[Xtensa::FeatureS32C1I];
   case Xtensa::NoRegister:
     return false;
   }
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index b0f924f2cd58e..be69cefb5b78f 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -114,14 +114,31 @@ void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   const DebugLoc &DL, Register DestReg,
                                   Register SrcReg, bool KillSrc,
                                   bool RenamableDest, bool RenamableSrc) const {
-  // The MOV instruction is not present in core ISA,
+  unsigned Opcode;
+
+  // The MOV instruction is not present in core ISA for AR registers,
   // so use OR instruction.
-  if (Xtensa::ARRegClass.contains(DestReg, SrcReg))
+  if (Xtensa::ARRegClass.contains(DestReg, SrcReg)) {
     BuildMI(MBB, MBBI, DL, get(Xtensa::OR), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc))
         .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+      Xtensa::FPRRegClass.contains(DestReg))
+    Opcode = Xtensa::MOV_S;
+  else if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+           Xtensa::ARRegClass.contains(DestReg))
+    Opcode = Xtensa::RFR;
+  else if (STI.hasSingleFloat() && Xtensa::ARRegClass.contains(SrcReg) &&
+           Xtensa::FPRRegClass.contains(DestReg))
+    Opcode = Xtensa::WFR;
   else
     report_fatal_error("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
 }
 
 void XtensaInstrInfo::storeRegToStackSlot(
diff --git a/llvm/test/CodeGen/Xtensa/s32c1i.ll b/llvm/test/CodeGen/Xtensa/s32c1i.ll
new file mode 100644
index 0000000000000..aad738abe6a4c
--- /dev/null
+++ b/llvm/test/CodeGen/Xtensa/s32c1i.ll
@@ -0,0 +1,7 @@
+; RUN: llc -mtriple=xtensa -mattr=+s32c1i  -filetype=obj %s -o - | llvm-objdump --arch=xtensa --mattr=s32c1i -d - | FileCheck %s -check-prefix=XTENSA
+
+define i32 @constraint_i(i32 %a) {
+; XTENSA: 0: 22 e2 01    s32c1i  a2, a2, 4
+  %res = tail call i32 asm "s32c1i $0, $1, $2", "=r,r,i"(i32 %a, i32 4)
+  ret i32 %res
+}
diff --git a/llvm/test/MC/Xtensa/s32c1i.s b/llvm/test/MC/Xtensa/s32c1i.s
new file mode 100644
index 0000000000000..218a86dd56752
--- /dev/null
+++ b/llvm/test/MC/Xtensa/s32c1i.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+s32c1i \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# CHECK-INST: xsr a3, atomctl
+# CHECK: # encoding: [0x30,0x63,0x61]
+xsr a3, atomctl
+
+# CHECK-INST: xsr a3, scompare1
+# CHECK: # encoding: [0x30,0x0c,0x61]
+xsr a3, scompare1

From efa7ab06ebf7e88a3cf1befebf58f24f8b09fcec Mon Sep 17 00:00:00 2001
From: Jin Huang <jinhuang1102@gmail.com>
Date: Wed, 5 Nov 2025 09:33:09 -0800
Subject: [PATCH 36/61] [profcheck] Add unknown branch weights to expanded
 cmpxchg loop. (#165841)

The AtomicExpandPass is responsible for lowering high-level atomic
operations (like `atomicrmw fadd`) that are unsupported by the target
hardware into a cmpxchg retry loop.

Given that we cannot empirically prove the precision branch weights, It
uses the `setExplicitlyUnknownBranchWeightsIfProfiled` function to
explicitly add "unknown" (50/50) branch weights to this branch.

This PR includes fies for the following tests:
```
Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
Transforms/AtomicExpand/AArch64/pcsections.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
Transforms/AtomicExpand/ARM/atomicrmw-fp.ll
Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll
Transforms/AtomicExpand/Mips/atomicrmw-fp.ll
Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll
Transforms/AtomicExpand/SPARC/libcalls.ll
Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll
Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll
Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
```

Co-authored-by: Jin Huang <jingold@google.com>
---
 llvm/lib/CodeGen/AtomicExpandPass.cpp              |  7 ++++++-
 .../AtomicExpand/AArch64/atomicrmw-fp.ll           | 14 +++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 0b55c03a46747..bd62b1646c761 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1686,7 +1686,12 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
 
   Loaded->addIncoming(NewLoaded, LoopBB);
 
-  Builder.CreateCondBr(Success, ExitBB, LoopBB);
+  Instruction *CondBr = Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+  // Atomic RMW expands to a cmpxchg loop, Since precise branch weights
+  // cannot be easily determined here, we mark the branch as "unknown" (50/50)
+  // to prevent misleading optimizations.
+  setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, *F, DEBUG_TYPE);
 
   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
   return NewLoaded;
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
index 8ffacb9bdd5f6..1b728f56ab2ea 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
@@ -1,7 +1,7 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -mtriple=aarch64-linux-gnu -passes=atomic-expand %s | FileCheck %s
 
-define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
+define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) !prof !0 {
 ; CHECK-LABEL: @test_atomicrmw_fadd_f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -14,7 +14,7 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret float [[TMP5]]
 ;
@@ -336,3 +336,11 @@ define <2 x half> @atomicrmw_fminimum_2_x_half(ptr %ptr, <2 x half> %val) {
   %res = atomicrmw fminimum ptr %ptr, <2 x half> %val seq_cst
   ret <2 x half> %res
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.

From d49c6707d07389e4bccdb23951dc9d3bc20996b1 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Wed, 5 Nov 2025 17:48:47 +0000
Subject: [PATCH 37/61] [libcxxabi][demangle] Fix the cp-to-llvm.sh sync script
 to copy all headers (#166572)

In https://github.com/llvm/llvm-project/pull/137947 I refactored the
script and added a `copy_files` function, which takes the header files
to copy as an argument.

But because the list of headers is a space separated string, we need to
quote the string. Otherwise we would just copy the first header in the
list.

This patch also adds an `echo` statement in the `copy_files` loop to
print the source/destination. Confirming that the files are copied as
expected.
```
$ libcxxabi/src/demangle/cp-to-llvm.sh
This will overwrite the copies of ItaniumDemangle.h ItaniumNodes.def StringViewExtras.h Utility.h in ../../../llvm/include/llvm/Demangle and DemangleTestCases.inc in ../../../llvm/include/llvm/Demangle/../Testing/Demangle; are you sure? [y/N]y
Copying ./ItaniumDemangle.h to ../../../llvm/include/llvm/Demangle/ItaniumDemangle.h
Copying ./ItaniumNodes.def to ../../../llvm/include/llvm/Demangle/ItaniumNodes.def
Copying ./StringViewExtras.h to ../../../llvm/include/llvm/Demangle/StringViewExtras.h
Copying ./Utility.h to ../../../llvm/include/llvm/Demangle/Utility.h
Copying ../../test/DemangleTestCases.inc to ../../../llvm/include/llvm/Demangle/../Testing/Demangle/DemangleTestCases.inc
```

Luckily there weren't any out-of-sync changes introduced in the
meantime.
---
 libcxxabi/src/demangle/cp-to-llvm.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libcxxabi/src/demangle/cp-to-llvm.sh b/libcxxabi/src/demangle/cp-to-llvm.sh
index f773dff9f0a8b..9c1db6fec29a6 100755
--- a/libcxxabi/src/demangle/cp-to-llvm.sh
+++ b/libcxxabi/src/demangle/cp-to-llvm.sh
@@ -42,6 +42,7 @@ copy_files() {
     chmod -w $dst/README.txt
 
     for I in $hdrs ; do
+	    echo "Copying ${src}/$I to ${dst}/$I"
 	    rm -f $dst/$I
 	    dash=$(echo "$I---------------------------" | cut -c -27 |\
 		       sed 's|[^-]*||')
@@ -53,6 +54,6 @@ copy_files() {
 }
 
 if [[ $ANSWER =~ ^[Yy]$ ]]; then
-    copy_files . $LLVM_DEMANGLE_DIR $HDRS
-    copy_files ../../test $LLVM_TESTING_DIR $TEST_HDRS
+    copy_files . $LLVM_DEMANGLE_DIR "${HDRS}"
+    copy_files ../../test $LLVM_TESTING_DIR "${TEST_HDRS}"
 fi

From 3c162ba247d30c9d8113e66fe5d96e24156ce797 Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Wed, 5 Nov 2025 19:07:44 +0100
Subject: [PATCH 38/61] [LLDB][NativePDB] Add non-overlapping fields in root
 struct (#166243)

When anonymous unions are used in a struct or vice versa, their fields
are merged into the parent record when using PDB. LLDB tries to recreate
the original definition of the record _with_ the anonymous
unions/structs.

For tagged unions (like `std::optional`) where the tag followed the
anonymous union, the result was suboptimal:

```cpp
// input:
struct Foo {
  union {
    Bar b;
    char c;
  };
  bool tag;
};

// reconstructed:
struct Foo {
  union {
    Bar b;
    struct {
      char c;
      bool tag;
    };
  };
};
```

Once the algorithm is in some nested union, it can't get out.

In the above case, we can get to the correct reconstructed record if we
always add fields that don't overlap others in the root struct. So when
we see `tag`, we'll see that it comes after all other fields, so it's
possible to add it in the root `Foo`.
---
 .../NativePDB/UdtRecordCompleter.cpp          | 23 +++++++-
 .../SymbolFile/NativePDB/class_layout.cpp     |  8 +--
 .../NativePDB/UdtRecordCompleterTests.cpp     | 52 ++++++++++++++++++-
 3 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
index 1c575e90bd72c..46cf9b8524ede 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
@@ -442,6 +442,10 @@ void UdtRecordCompleter::Record::ConstructRecord() {
 
   // The end offset to a vector of field/struct that ends at the offset.
   std::map<uint64_t, std::vector<Member *>> end_offset_map;
+  auto is_last_end_offset = [&](auto it) {
+    return it != end_offset_map.end() && ++it == end_offset_map.end();
+  };
+
   for (auto &pair : fields_map) {
     uint64_t offset = pair.first;
     auto &fields = pair.second;
@@ -462,8 +466,23 @@ void UdtRecordCompleter::Record::ConstructRecord() {
       }
       if (iter->second.empty())
         continue;
-      parent = iter->second.back();
-      iter->second.pop_back();
+
+      // If the new fields come after the already added ones
+      // without overlap, go back to the root.
+      if (iter->first <= offset && is_last_end_offset(iter)) {
+        if (record.kind == Member::Struct) {
+          parent = &record;
+        } else {
+          assert(record.kind == Member::Union &&
+                 "Current record must be a union");
+          assert(!record.fields.empty());
+          // For unions, append the field to the last struct
+          parent = record.fields.back().get();
+        }
+      } else {
+        parent = iter->second.back();
+        iter->second.pop_back();
+      }
     }
     // If it's a field, then the field is inside a union, so we can safely
     // increase its size by converting it to a struct to hold multiple fields.
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp
index 36bfdb9a8e565..83ed533eb13e3 100644
--- a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp
+++ b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp
@@ -34,9 +34,6 @@
 // CHECK-NEXT:           s4 = {
 // CHECK-NEXT:             x = ([0] = 67, [1] = 68, [2] = 99)
 // CHECK-NEXT:           }
-// CHECK-NEXT:           s1 = {
-// CHECK-NEXT:             x = ([0] = 69, [1] = 70, [2] = 71)
-// CHECK-NEXT:           }
 // CHECK-NEXT:         }
 // CHECK-NEXT:       }
 // CHECK-NEXT:     }
@@ -47,6 +44,9 @@
 // CHECK-NEXT:       c2 = 'D'
 // CHECK-NEXT:     }
 // CHECK-NEXT:   }
+// CHECK-NEXT:   s1 = {
+// CHECK-NEXT:     x = ([0] = 69, [1] = 70, [2] = 71)
+// CHECK-NEXT:   }
 // CHECK-NEXT: }
 // CHECK-NEXT: (lldb) type lookup C
 // CHECK-NEXT: struct C {
@@ -63,7 +63,6 @@
 // CHECK-NEXT:                 struct {
 // CHECK-NEXT:                     char c4;
 // CHECK-NEXT:                     S3 s4;
-// CHECK-NEXT:                     S3 s1;
 // CHECK-NEXT:                 };
 // CHECK-NEXT:             };
 // CHECK-NEXT:         };
@@ -72,6 +71,7 @@
 // CHECK-NEXT:             char c2;
 // CHECK-NEXT:         };
 // CHECK-NEXT:     };
+// CHECK-NEXT:     S3 s1;
 // CHECK-NEXT: }
 
 
diff --git a/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp b/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp
index 17284b61b9a6e..cd6db5fcb1f4c 100644
--- a/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp
+++ b/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp
@@ -99,7 +99,7 @@ Member *AddField(Member *member, StringRef name, uint64_t byte_offset,
       std::make_unique<Member>(name, byte_offset * 8, byte_size * 8,
                                clang::QualType(), lldb::eAccessPublic, 0);
   field->kind = kind;
-  field->base_offset = base_offset;
+  field->base_offset = base_offset * 8;
   member->fields.push_back(std::move(field));
   return member->fields.back().get();
 }
@@ -111,6 +111,9 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) {
   CollectMember("m2", 0, 4);
   CollectMember("m3", 0, 1);
   CollectMember("m4", 0, 8);
+  CollectMember("m5", 8, 8);
+  CollectMember("m6", 16, 4);
+  CollectMember("m7", 16, 8);
   ConstructRecord();
 
   // struct {
@@ -120,6 +123,11 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) {
   //       m3;
   //       m4;
   //   };
+  //   m5;
+  //   union {
+  //       m6;
+  //       m7;
+  //   };
   // };
   Record record;
   record.start_offset = 0;
@@ -128,6 +136,10 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) {
   AddField(u, "m2", 0, 4, Member::Field);
   AddField(u, "m3", 0, 1, Member::Field);
   AddField(u, "m4", 0, 8, Member::Field);
+  AddField(&record.record, "m5", 8, 8, Member::Field);
+  Member *u2 = AddField(&record.record, "", 16, 0, Member::Union);
+  AddField(u2, "m6", 16, 4, Member::Field);
+  AddField(u2, "m7", 16, 8, Member::Field);
   EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record));
 }
 
@@ -243,3 +255,41 @@ TEST_F(UdtRecordCompleterRecordTests, TestNestedUnionStructInUnion) {
   AddField(s2, "m4", 2, 4, Member::Field);
   EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record));
 }
+
+TEST_F(UdtRecordCompleterRecordTests, TestNestedStructInUnionInStructInUnion) {
+  SetKind(Member::Kind::Union);
+  CollectMember("m1", 0, 4);
+  CollectMember("m2", 0, 2);
+  CollectMember("m3", 0, 2);
+  CollectMember("m4", 2, 4);
+  CollectMember("m5", 6, 2);
+  CollectMember("m6", 6, 2);
+  CollectMember("m7", 8, 2);
+  ConstructRecord();
+
+  // union {
+  //   m1;
+  //   m2;
+  //   struct {
+  //       m3;
+  //       m4;
+  //       union {
+  //           m5;
+  //           m6;
+  //       };
+  //       m7;
+  //   };
+  // };
+  Record record;
+  record.start_offset = 0;
+  AddField(&record.record, "m1", 0, 4, Member::Field);
+  AddField(&record.record, "m2", 0, 2, Member::Field);
+  Member *s = AddField(&record.record, "", 0, 0, Member::Struct);
+  AddField(s, "m3", 0, 2, Member::Field);
+  AddField(s, "m4", 2, 4, Member::Field);
+  Member *u = AddField(s, "", 6, 0, Member::Union);
+  AddField(u, "m5", 6, 2, Member::Field);
+  AddField(u, "m6", 6, 2, Member::Field);
+  AddField(s, "m7", 8, 2, Member::Field);
+  EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record));
+}

From af0b6b18a8690e98586f342d28bb97a29c0eb45d Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim@gymni.ch>
Date: Wed, 5 Nov 2025 19:16:03 +0100
Subject: [PATCH 39/61] [ProfCheck][NFC] fix argument order for call to
 setExplicitlyUnknownBranchWeightsIfProfiled (#166601)

---
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index bd62b1646c761..d9bc042d6807e 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1691,7 +1691,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
   // Atomic RMW expands to a cmpxchg loop, Since precise branch weights
   // cannot be easily determined here, we mark the branch as "unknown" (50/50)
   // to prevent misleading optimizations.
-  setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, *F, DEBUG_TYPE);
+  setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE);
 
   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
   return NewLoaded;

From ebeb36b12e4649954a62dfbef7a5b04c5d8e52d7 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Wed, 5 Nov 2025 13:22:00 -0500
Subject: [PATCH 40/61] [PowerPC] Implement vsx rotate left word instr
 (#160754)

Implement `xvrlw`.
---
 llvm/lib/Target/PowerPC/PPCInstrFuture.td                     | 3 +++
 llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt  | 3 +++
 .../MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt    | 3 +++
 llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s                 | 4 ++++
 4 files changed, 13 insertions(+)

diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 0c2e44e18f463..dfbbba0116f25 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -420,6 +420,9 @@ let Predicates = [HasVSX, IsISAFuture] in {
       : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
                       "vucmprlh $VRT, $VRA, $VRB", []>;
 
+  def XVRLW: XX3Form_XTAB6<60, 184, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                              "xvrlw $XT, $XA, $XB", []>;
+
   // AES Acceleration Instructions
   def XXAESENCP : XX3Form_XTABp5_M2<194, (outs vsrprc:$XTp),
                                     (ins vsrprc:$XAp, vsrprc:$XBp, u2imm:$M),
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
index 2661ed5b04cc9..b27a50d93f5b9 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
@@ -250,6 +250,9 @@
 #CHECK: vucmprhh 1, 3, 6
 0x10,0x23,0x31,0x03
 
+#CHECK: xvrlw 34, 15, 16
+0xf0,0x4f,0x85,0xc1
+
 #CHECK: xxaes192encp 8, 10, 14
 0xf1,0x0b,0x76,0x10
 
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
index 7fb8254ced0ac..72662d9736740 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
@@ -244,6 +244,9 @@
 #CHECK: vucmprhh 1, 3, 6
 0x03,0x31,0x23,0x10
 
+#CHECK: xvrlw 34, 15, 16
+0xc1,0x85,0x4f,0xf0
+
 #CHECK: xxaes192encp 8, 10, 14
 0x10,0x76,0x0b,0xf1
 
diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
index 40059c440b128..ab72649fc3404 100644
--- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
+++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
@@ -355,6 +355,10 @@
 #CHECK-BE: vucmprhh 1, 3, 6               # encoding: [0x10,0x23,0x31,0x03]
 #CHECK-LE: vucmprhh 1, 3, 6               # encoding: [0x03,0x31,0x23,0x10]
 
+           xvrlw 34, 15, 16
+#CHECK-BE: xvrlw 34, 15, 16              # encoding: [0xf0,0x4f,0x85,0xc1]
+#CHECK-LE: xvrlw 34, 15, 16              # encoding: [0xc1,0x85,0x4f,0xf0]
+
            xxaes192encp 8, 10, 14
 #CHECK-BE: xxaes192encp 8, 10, 14         # encoding: [0xf1,0x0b,0x76,0x10]
 #CHECK-LE: xxaes192encp 8, 10, 14         # encoding: [0x10,0x76,0x0b,0xf1]

From d3caae1c07c297a5765d0498faf43f4730f71466 Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Wed, 5 Nov 2025 10:24:56 -0800
Subject: [PATCH 41/61] [clang] Refactor clang's keyword enable/disable
 mechanism to allow lldb to re-use it (#165323)

lldb's CPlusPlusNameParser is currently identifying keywords using it's
own map implemented using clang/Basic/TokenKinds.def. However, it does
not respect the language options so identifiers can incorrectly
determined to be keywords when using languages in which they are not
keywords.

Rather than implement the logic to enable/disable keywords in both
projects it makes sense for lldb to use clang's implementation.

See #164284 for more information
---
 clang/include/clang/Basic/IdentifierTable.h | 51 +++++++++++++++++++
 clang/lib/Basic/IdentifierTable.cpp         | 55 +--------------------
 2 files changed, 52 insertions(+), 54 deletions(-)

diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
index e4044bcdfcc60..b27492d19a65b 100644
--- a/clang/include/clang/Basic/IdentifierTable.h
+++ b/clang/include/clang/Basic/IdentifierTable.h
@@ -46,6 +46,57 @@ class LangOptions;
 class MultiKeywordSelector;
 class SourceLocation;
 
+/// Constants for TokenKinds.def
+enum TokenKey : unsigned {
+  KEYC99 = 0x1,
+  KEYCXX = 0x2,
+  KEYCXX11 = 0x4,
+  KEYGNU = 0x8,
+  KEYMS = 0x10,
+  BOOLSUPPORT = 0x20,
+  KEYALTIVEC = 0x40,
+  KEYNOCXX = 0x80,
+  KEYBORLAND = 0x100,
+  KEYOPENCLC = 0x200,
+  KEYC23 = 0x400,
+  KEYNOMS18 = 0x800,
+  KEYNOOPENCL = 0x1000,
+  WCHARSUPPORT = 0x2000,
+  HALFSUPPORT = 0x4000,
+  CHAR8SUPPORT = 0x8000,
+  KEYOBJC = 0x10000,
+  KEYZVECTOR = 0x20000,
+  KEYCOROUTINES = 0x40000,
+  KEYMODULES = 0x80000,
+  KEYCXX20 = 0x100000,
+  KEYOPENCLCXX = 0x200000,
+  KEYMSCOMPAT = 0x400000,
+  KEYSYCL = 0x800000,
+  KEYCUDA = 0x1000000,
+  KEYZOS = 0x2000000,
+  KEYNOZOS = 0x4000000,
+  KEYHLSL = 0x8000000,
+  KEYFIXEDPOINT = 0x10000000,
+  KEYMAX = KEYFIXEDPOINT, // The maximum key
+  KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20,
+  KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL &
+           ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded.
+};
+
+/// How a keyword is treated in the selected standard. This enum is ordered
+/// intentionally so that the value that 'wins' is the most 'permissive'.
+enum KeywordStatus {
+  KS_Unknown,   // Not yet calculated. Used when figuring out the status.
+  KS_Disabled,  // Disabled
+  KS_Future,    // Is a keyword in future standard
+  KS_Extension, // Is an extension
+  KS_Enabled,   // Enabled
+};
+
+/// Translates flags as specified in TokenKinds.def into keyword status
+/// in the given language standard.
+KeywordStatus getKeywordStatus(const LangOptions &LangOpts, unsigned Flags);
+
 enum class ReservedIdentifierStatus {
   NotReserved = 0,
   StartsWithUnderscoreAtGlobalScope,
diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp
index 4a2b77cd16bfc..d1c959b9687c4 100644
--- a/clang/lib/Basic/IdentifierTable.cpp
+++ b/clang/lib/Basic/IdentifierTable.cpp
@@ -77,57 +77,6 @@ IdentifierTable::IdentifierTable(const LangOptions &LangOpts,
 // Language Keyword Implementation
 //===----------------------------------------------------------------------===//
 
-// Constants for TokenKinds.def
-namespace {
-
-enum TokenKey : unsigned {
-  KEYC99 = 0x1,
-  KEYCXX = 0x2,
-  KEYCXX11 = 0x4,
-  KEYGNU = 0x8,
-  KEYMS = 0x10,
-  BOOLSUPPORT = 0x20,
-  KEYALTIVEC = 0x40,
-  KEYNOCXX = 0x80,
-  KEYBORLAND = 0x100,
-  KEYOPENCLC = 0x200,
-  KEYC23 = 0x400,
-  KEYNOMS18 = 0x800,
-  KEYNOOPENCL = 0x1000,
-  WCHARSUPPORT = 0x2000,
-  HALFSUPPORT = 0x4000,
-  CHAR8SUPPORT = 0x8000,
-  KEYOBJC = 0x10000,
-  KEYZVECTOR = 0x20000,
-  KEYCOROUTINES = 0x40000,
-  KEYMODULES = 0x80000,
-  KEYCXX20 = 0x100000,
-  KEYOPENCLCXX = 0x200000,
-  KEYMSCOMPAT = 0x400000,
-  KEYSYCL = 0x800000,
-  KEYCUDA = 0x1000000,
-  KEYZOS = 0x2000000,
-  KEYNOZOS = 0x4000000,
-  KEYHLSL = 0x8000000,
-  KEYFIXEDPOINT = 0x10000000,
-  KEYMAX = KEYFIXEDPOINT, // The maximum key
-  KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20,
-  KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL &
-           ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded.
-};
-
-/// How a keyword is treated in the selected standard. This enum is ordered
-/// intentionally so that the value that 'wins' is the most 'permissive'.
-enum KeywordStatus {
-  KS_Unknown,   // Not yet calculated. Used when figuring out the status.
-  KS_Disabled,  // Disabled
-  KS_Future,    // Is a keyword in future standard
-  KS_Extension, // Is an extension
-  KS_Enabled,   // Enabled
-};
-
-} // namespace
-
 // This works on a single TokenKey flag and checks the LangOpts to get the
 // KeywordStatus based exclusively on this flag, so that it can be merged in
 // getKeywordStatus. Most should be enabled/disabled, but some might imply
@@ -220,9 +169,7 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts,
   }
 }
 
-/// Translates flags as specified in TokenKinds.def into keyword status
-/// in the given language standard.
-static KeywordStatus getKeywordStatus(const LangOptions &LangOpts,
+KeywordStatus clang::getKeywordStatus(const LangOptions &LangOpts,
                                       unsigned Flags) {
   // KEYALL means always enabled, so special case this one.
   if (Flags == KEYALL) return KS_Enabled;

From bc55f4f4f2b4ef196cf3ec25f69dfbd9cd032237 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 5 Nov 2025 10:32:38 -0800
Subject: [PATCH 42/61] [debugserver] Fix debugserver build on < macOS 10.15
 (#166599)

The VM_MEMORY_SANITIZER constant was added in macOs 10.15 and friends.
Support using the constant on older OSes.

Fixes #156144
---
 lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp
index 9d0d60fdaaed9..c8dce75af05eb 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp
+++ b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp
@@ -14,6 +14,12 @@
 #include "DNBLog.h"
 #include <cassert>
 #include <mach/mach_vm.h>
+#include <mach/vm_statistics.h>
+
+// From <mach/vm_statistics.h>, but not on older OSs.
+#ifndef VM_MEMORY_SANITIZER
+#define VM_MEMORY_SANITIZER 99
+#endif
 
 MachVMRegion::MachVMRegion(task_t task)
     : m_task(task), m_addr(INVALID_NUB_ADDRESS), m_err(),

From c193eea86e9f0111e15df62343813857e306b779 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 5 Nov 2025 18:39:01 +0000
Subject: [PATCH 43/61] [gn build] Port 056d2c12f756

---
 llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
index 444670212cafb..eb41df208941a 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
@@ -88,6 +88,7 @@ static_library("CodeGen") {
     "LatencyPriorityQueue.cpp",
     "LazyMachineBlockFrequencyInfo.cpp",
     "LexicalScopes.cpp",
+    "LibcallLoweringInfo.cpp",
     "LiveDebugValues/InstrRefBasedImpl.cpp",
     "LiveDebugValues/LiveDebugValues.cpp",
     "LiveDebugValues/VarLocBasedImpl.cpp",

From 120689e46679c6db37cd9e839ec0721e80a22d4f Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 5 Nov 2025 11:02:28 -0800
Subject: [PATCH 44/61] [libc] Migrate ctype_utils to use char instead of int
 where applicable. (#166225)

Functions like isalpha / tolower can operate on chars internally. This
allows us to get rid of unnecessary casts and open a way to creating
wchar_t overloads with the same names (e.g. for isalpha), that would
simplify templated code for conversion functions (see
315dfe5865962d8a3d60e21d1fffce5214fe54ef).

Add the int->char converstion to public entrypoints implementation
instead. We also need to introduce bounds check on the input argument
values - these functions' behavior is unspecified if the argument is
neither EOF nor fits in "unsigned char" range, but the tests we've had
verified that they always return false for small negative values. To
preserve this behavior, cover it explicitly.
---
 libc/src/__support/ctype_utils.h              | 24 +++++++++----------
 libc/src/__support/integer_to_string.h        |  5 ++--
 libc/src/ctype/CMakeLists.txt                 | 23 ++++++++++++++++++
 libc/src/ctype/isalnum.cpp                    |  7 ++++--
 libc/src/ctype/isalnum_l.cpp                  |  7 ++++--
 libc/src/ctype/isalpha.cpp                    |  5 +++-
 libc/src/ctype/isalpha_l.cpp                  |  5 +++-
 libc/src/ctype/isdigit.cpp                    |  6 ++++-
 libc/src/ctype/isdigit_l.cpp                  |  6 ++++-
 libc/src/ctype/isgraph.cpp                    |  5 +++-
 libc/src/ctype/isgraph_l.cpp                  |  5 +++-
 libc/src/ctype/islower.cpp                    |  7 ++++--
 libc/src/ctype/islower_l.cpp                  |  7 ++++--
 libc/src/ctype/ispunct.cpp                    |  5 +++-
 libc/src/ctype/ispunct_l.cpp                  |  5 +++-
 libc/src/ctype/isspace.cpp                    |  7 ++++--
 libc/src/ctype/isspace_l.cpp                  |  7 ++++--
 libc/src/ctype/isupper.cpp                    |  7 ++++--
 libc/src/ctype/isupper_l.cpp                  |  7 ++++--
 libc/src/ctype/isxdigit.cpp                   |  7 ++++--
 libc/src/ctype/isxdigit_l.cpp                 |  7 ++++--
 libc/src/ctype/tolower.cpp                    | 11 +++++++--
 libc/src/ctype/tolower_l.cpp                  |  9 +++++--
 libc/src/ctype/toupper.cpp                    | 11 +++++++--
 libc/src/ctype/toupper_l.cpp                  |  9 +++++--
 .../printf_core/float_dec_converter_limited.h |  4 ++--
 .../stdio/printf_core/float_hex_converter.h   |  7 +++---
 libc/src/stdlib/l64a.cpp                      |  6 ++---
 libc/src/string/strcasestr.cpp                |  4 ++--
 libc/src/strings/strcasecmp.cpp               |  4 ++--
 libc/src/strings/strcasecmp_l.cpp             |  4 ++--
 libc/src/strings/strncasecmp.cpp              |  4 ++--
 libc/src/strings/strncasecmp_l.cpp            |  4 ++--
 libc/test/UnitTest/MemoryMatcher.cpp          |  4 ++--
 libc/test/src/ctype/islower_test.cpp          |  2 +-
 libc/test/src/stdlib/StrtolTest.h             | 24 +++++++++----------
 .../llvm-project-overlay/libc/BUILD.bazel     | 11 +++++++++
 37 files changed, 196 insertions(+), 86 deletions(-)

diff --git a/libc/src/__support/ctype_utils.h b/libc/src/__support/ctype_utils.h
index be0f25330af9e..61b7a0aeb5b67 100644
--- a/libc/src/__support/ctype_utils.h
+++ b/libc/src/__support/ctype_utils.h
@@ -27,7 +27,7 @@ namespace internal {
 // as well as a way to support non-ASCII character encodings.
 
 // Similarly, do not change these functions to use case ranges. e.g.
-//  bool islower(int ch) {
+//  bool islower(char ch) {
 //    switch(ch) {
 //    case 'a'...'z':
 //      return true;
@@ -37,7 +37,7 @@ namespace internal {
 // EBCDIC. Technically we could use some smaller ranges, but that's even harder
 // to read.
 
-LIBC_INLINE static constexpr bool islower(int ch) {
+LIBC_INLINE static constexpr bool islower(char ch) {
   switch (ch) {
   case 'a':
   case 'b':
@@ -71,7 +71,7 @@ LIBC_INLINE static constexpr bool islower(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isupper(int ch) {
+LIBC_INLINE static constexpr bool isupper(char ch) {
   switch (ch) {
   case 'A':
   case 'B':
@@ -105,7 +105,7 @@ LIBC_INLINE static constexpr bool isupper(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isdigit(int ch) {
+LIBC_INLINE static constexpr bool isdigit(char ch) {
   switch (ch) {
   case '0':
   case '1':
@@ -123,7 +123,7 @@ LIBC_INLINE static constexpr bool isdigit(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr int tolower(int ch) {
+LIBC_INLINE static constexpr char tolower(char ch) {
   switch (ch) {
   case 'A':
     return 'a';
@@ -182,7 +182,7 @@ LIBC_INLINE static constexpr int tolower(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr int toupper(int ch) {
+LIBC_INLINE static constexpr char toupper(char ch) {
   switch (ch) {
   case 'a':
     return 'A';
@@ -241,7 +241,7 @@ LIBC_INLINE static constexpr int toupper(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isalpha(int ch) {
+LIBC_INLINE static constexpr bool isalpha(char ch) {
   switch (ch) {
   case 'a':
   case 'b':
@@ -301,7 +301,7 @@ LIBC_INLINE static constexpr bool isalpha(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isalnum(int ch) {
+LIBC_INLINE static constexpr bool isalnum(char ch) {
   switch (ch) {
   case 'a':
   case 'b':
@@ -371,7 +371,7 @@ LIBC_INLINE static constexpr bool isalnum(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr int b36_char_to_int(int ch) {
+LIBC_INLINE static constexpr int b36_char_to_int(char ch) {
   switch (ch) {
   case '0':
     return 0;
@@ -476,7 +476,7 @@ LIBC_INLINE static constexpr int b36_char_to_int(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr int int_to_b36_char(int num) {
+LIBC_INLINE static constexpr char int_to_b36_char(int num) {
   // Can't actually use LIBC_ASSERT here because it depends on integer_to_string
   // which depends on this.
 
@@ -559,7 +559,7 @@ LIBC_INLINE static constexpr int int_to_b36_char(int num) {
   }
 }
 
-LIBC_INLINE static constexpr bool isspace(int ch) {
+LIBC_INLINE static constexpr bool isspace(char ch) {
   switch (ch) {
   case ' ':
   case '\t':
@@ -574,7 +574,7 @@ LIBC_INLINE static constexpr bool isspace(int ch) {
 }
 
 // not yet encoding independent.
-LIBC_INLINE static constexpr bool isgraph(int ch) {
+LIBC_INLINE static constexpr bool isgraph(char ch) {
   return 0x20 < ch && ch < 0x7f;
 }
 
diff --git a/libc/src/__support/integer_to_string.h b/libc/src/__support/integer_to_string.h
index 29449bd739730..5e7369de00962 100644
--- a/libc/src/__support/integer_to_string.h
+++ b/libc/src/__support/integer_to_string.h
@@ -378,9 +378,8 @@ template <typename T, typename Fmt = radix::Dec> class IntegerToString {
     using UNSIGNED_T = make_integral_or_big_int_unsigned_t<T>;
 
     LIBC_INLINE static char digit_char(uint8_t digit) {
-      const int result = internal::int_to_b36_char(digit);
-      return static_cast<char>(Fmt::IS_UPPERCASE ? internal::toupper(result)
-                                                 : result);
+      const char result = internal::int_to_b36_char(digit);
+      return Fmt::IS_UPPERCASE ? internal::toupper(result) : result;
     }
 
     LIBC_INLINE static void
diff --git a/libc/src/ctype/CMakeLists.txt b/libc/src/ctype/CMakeLists.txt
index 8830c1bccf9ea..68e982bd4529e 100644
--- a/libc/src/ctype/CMakeLists.txt
+++ b/libc/src/ctype/CMakeLists.txt
@@ -6,6 +6,7 @@ add_entrypoint_object(
     isalnum.h
   DEPENDS
     libc.include.ctype
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -16,6 +17,7 @@ add_entrypoint_object(
   HDRS
     isalpha.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -50,6 +52,7 @@ add_entrypoint_object(
   HDRS
     isdigit.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -60,6 +63,7 @@ add_entrypoint_object(
   HDRS
     isgraph.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -70,6 +74,7 @@ add_entrypoint_object(
   HDRS
     islower.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -88,6 +93,7 @@ add_entrypoint_object(
   HDRS
     ispunct.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -97,6 +103,9 @@ add_entrypoint_object(
     isspace.cpp
   HDRS
     isspace.h
+  DEPENDS
+    libc.src.__support.CPP.limits
+    libc.src.__support.ctype_utils
 )
 
 add_entrypoint_object(
@@ -106,6 +115,7 @@ add_entrypoint_object(
   HDRS
     isupper.h
   DEPENDS
+    libc.src.__support.CPP.limits  
     libc.src.__support.ctype_utils
 )
 
@@ -116,6 +126,7 @@ add_entrypoint_object(
   HDRS
     isxdigit.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -126,6 +137,7 @@ add_entrypoint_object(
   HDRS
     tolower.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -144,6 +156,7 @@ add_entrypoint_object(
   HDRS
     toupper.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -160,6 +173,7 @@ add_entrypoint_object(
     isalnum_l.h
   DEPENDS
     libc.include.ctype
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -171,6 +185,7 @@ add_entrypoint_object(
   HDRS
     isalpha_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -202,6 +217,7 @@ add_entrypoint_object(
   HDRS
     isdigit_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -224,6 +240,7 @@ add_entrypoint_object(
   HDRS
     islower_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -257,6 +274,8 @@ add_entrypoint_object(
     isspace_l.h
   DEPENDS
     libc.hdr.types.locale_t
+    libc.src.__support.CPP.limits
+    libc.src.__support.ctype_utils
 )
 
 add_entrypoint_object(
@@ -266,6 +285,7 @@ add_entrypoint_object(
   HDRS
     isupper_l.h
   DEPENDS
+    libc.src.__support.CPP.limits  
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -277,6 +297,7 @@ add_entrypoint_object(
   HDRS
     isxdigit_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -288,6 +309,7 @@ add_entrypoint_object(
   HDRS
     tolower_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -299,6 +321,7 @@ add_entrypoint_object(
   HDRS
     toupper_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
diff --git a/libc/src/ctype/isalnum.cpp b/libc/src/ctype/isalnum.cpp
index 54a3e35748879..102b5e79e4a18 100644
--- a/libc/src/ctype/isalnum.cpp
+++ b/libc/src/ctype/isalnum.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isalnum.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isalnum, (int c)) {
-  return static_cast<int>(internal::isalnum(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isalnum(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isalnum_l.cpp b/libc/src/ctype/isalnum_l.cpp
index 671d9b75c4c33..173e1c174121e 100644
--- a/libc/src/ctype/isalnum_l.cpp
+++ b/libc/src/ctype/isalnum_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isalnum_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isalnum_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isalnum(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isalnum(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isalpha.cpp b/libc/src/ctype/isalpha.cpp
index 78b26f6a486ea..7c874bf373866 100644
--- a/libc/src/ctype/isalpha.cpp
+++ b/libc/src/ctype/isalpha.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/isalpha.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isalpha, (int c)) {
-  return static_cast<int>(internal::isalpha(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isalpha(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isalpha_l.cpp b/libc/src/ctype/isalpha_l.cpp
index 0619d979bedf2..982bcc569faaf 100644
--- a/libc/src/ctype/isalpha_l.cpp
+++ b/libc/src/ctype/isalpha_l.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/isalpha_l.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isalpha_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isalpha(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isalpha(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isdigit.cpp b/libc/src/ctype/isdigit.cpp
index 1f711943861f8..43553c794a2f3 100644
--- a/libc/src/ctype/isdigit.cpp
+++ b/libc/src/ctype/isdigit.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isdigit.h"
+
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -14,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isdigit, (int c)) {
-  return static_cast<int>(internal::isdigit(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isdigit(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isdigit_l.cpp b/libc/src/ctype/isdigit_l.cpp
index ca981362bfe83..40b5618906dac 100644
--- a/libc/src/ctype/isdigit_l.cpp
+++ b/libc/src/ctype/isdigit_l.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isdigit_l.h"
+
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -14,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isdigit_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isdigit(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isdigit(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isgraph.cpp b/libc/src/ctype/isgraph.cpp
index 74bb2e75d138e..b9308ecb7367c 100644
--- a/libc/src/ctype/isgraph.cpp
+++ b/libc/src/ctype/isgraph.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/isgraph.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isgraph, (int c)) {
-  return static_cast<int>(internal::isgraph(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isgraph(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isgraph_l.cpp b/libc/src/ctype/isgraph_l.cpp
index cbef6df148aed..dddcb9be4f80c 100644
--- a/libc/src/ctype/isgraph_l.cpp
+++ b/libc/src/ctype/isgraph_l.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/isgraph_l.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isgraph_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isgraph(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isgraph(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/islower.cpp b/libc/src/ctype/islower.cpp
index 831aad32d3a22..920bfc1cc1a59 100644
--- a/libc/src/ctype/islower.cpp
+++ b/libc/src/ctype/islower.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/islower.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, islower, (int c)) {
-  return static_cast<int>(internal::islower(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::islower(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/islower_l.cpp b/libc/src/ctype/islower_l.cpp
index b9be6acc81c99..da97026dc59a7 100644
--- a/libc/src/ctype/islower_l.cpp
+++ b/libc/src/ctype/islower_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/islower_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, islower_l, (int c, locale_t)) {
-  return static_cast<int>(internal::islower(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::islower(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/ispunct.cpp b/libc/src/ctype/ispunct.cpp
index 0635294220b9c..4950036e9b81f 100644
--- a/libc/src/ctype/ispunct.cpp
+++ b/libc/src/ctype/ispunct.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/ispunct.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, ispunct, (int c)) {
-  const unsigned ch = static_cast<unsigned>(c);
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  const char ch = static_cast<char>(c);
   return static_cast<int>(!internal::isalnum(ch) && internal::isgraph(ch));
 }
 
diff --git a/libc/src/ctype/ispunct_l.cpp b/libc/src/ctype/ispunct_l.cpp
index e825fbe2001b0..79cd47b6a214d 100644
--- a/libc/src/ctype/ispunct_l.cpp
+++ b/libc/src/ctype/ispunct_l.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/ispunct_l.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, ispunct_l, (int c, locale_t)) {
-  const unsigned ch = static_cast<unsigned>(c);
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  const char ch = static_cast<char>(c);
   return static_cast<int>(!internal::isalnum(ch) && internal::isgraph(ch));
 }
 
diff --git a/libc/src/ctype/isspace.cpp b/libc/src/ctype/isspace.cpp
index 005bf460fc103..998dbf28f51d0 100644
--- a/libc/src/ctype/isspace.cpp
+++ b/libc/src/ctype/isspace.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isspace.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isspace, (int c)) {
-  return static_cast<int>(internal::isspace(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isspace(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isspace_l.cpp b/libc/src/ctype/isspace_l.cpp
index 5c46dd6805126..e40765326b35e 100644
--- a/libc/src/ctype/isspace_l.cpp
+++ b/libc/src/ctype/isspace_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isspace_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isspace_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isspace(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isspace(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isupper.cpp b/libc/src/ctype/isupper.cpp
index 965fa336b28b4..c5c3dbd5d7d4a 100644
--- a/libc/src/ctype/isupper.cpp
+++ b/libc/src/ctype/isupper.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isupper.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isupper, (int c)) {
-  return static_cast<int>(internal::isupper(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isupper(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isupper_l.cpp b/libc/src/ctype/isupper_l.cpp
index 358990261d603..44ed9dab90a16 100644
--- a/libc/src/ctype/isupper_l.cpp
+++ b/libc/src/ctype/isupper_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isupper_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isupper_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isupper(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isupper(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isxdigit.cpp b/libc/src/ctype/isxdigit.cpp
index 81f645c6f49fc..1b2e71769b3f8 100644
--- a/libc/src/ctype/isxdigit.cpp
+++ b/libc/src/ctype/isxdigit.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isxdigit.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isxdigit, (int c)) {
-  const unsigned ch = static_cast<unsigned>(c);
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  const char ch = static_cast<char>(c);
   return static_cast<int>(internal::isalnum(ch) &&
                           internal::b36_char_to_int(ch) < 16);
 }
diff --git a/libc/src/ctype/isxdigit_l.cpp b/libc/src/ctype/isxdigit_l.cpp
index eddfd20a2da3b..e6150473b0043 100644
--- a/libc/src/ctype/isxdigit_l.cpp
+++ b/libc/src/ctype/isxdigit_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isxdigit_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isxdigit_l, (int c, locale_t)) {
-  const unsigned ch = static_cast<unsigned>(c);
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  const char ch = static_cast<char>(c);
   return static_cast<int>(internal::isalnum(ch) &&
                           internal::b36_char_to_int(ch) < 16);
 }
diff --git a/libc/src/ctype/tolower.cpp b/libc/src/ctype/tolower.cpp
index 3ecad7bc5d5d5..b45c5f2688a61 100644
--- a/libc/src/ctype/tolower.cpp
+++ b/libc/src/ctype/tolower.cpp
@@ -7,13 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/tolower.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(int, tolower, (int c)) { return internal::tolower(c); }
+LLVM_LIBC_FUNCTION(int, tolower, (int c)) {
+  if (c < cpp::numeric_limits<char>::min() ||
+      c > cpp::numeric_limits<char>::max()) {
+    return c;
+  }
+  return static_cast<int>(internal::tolower(static_cast<char>(c)));
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/tolower_l.cpp b/libc/src/ctype/tolower_l.cpp
index 7ccf31617e592..049e46aea13c0 100644
--- a/libc/src/ctype/tolower_l.cpp
+++ b/libc/src/ctype/tolower_l.cpp
@@ -7,15 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/tolower_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, tolower_l, (int c, locale_t)) {
-  return internal::tolower(c);
+  if (c < cpp::numeric_limits<char>::min() ||
+      c > cpp::numeric_limits<char>::max()) {
+    return c;
+  }
+  return static_cast<int>(internal::tolower(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/toupper.cpp b/libc/src/ctype/toupper.cpp
index 1e1e8fc400711..0e387238ce3b6 100644
--- a/libc/src/ctype/toupper.cpp
+++ b/libc/src/ctype/toupper.cpp
@@ -7,13 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/toupper.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(int, toupper, (int c)) { return internal::toupper(c); }
+LLVM_LIBC_FUNCTION(int, toupper, (int c)) {
+  if (c < cpp::numeric_limits<char>::min() ||
+      c > cpp::numeric_limits<char>::max()) {
+    return c;
+  }
+  return static_cast<int>(internal::toupper(static_cast<char>(c)));
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/toupper_l.cpp b/libc/src/ctype/toupper_l.cpp
index a435ca1ab5d41..d1dff262c9377 100644
--- a/libc/src/ctype/toupper_l.cpp
+++ b/libc/src/ctype/toupper_l.cpp
@@ -7,15 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/toupper_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, toupper_l, (int c, locale_t)) {
-  return internal::toupper(c);
+  if (c < cpp::numeric_limits<char>::min() ||
+      c > cpp::numeric_limits<char>::max()) {
+    return c;
+  }
+  return static_cast<int>(internal::toupper(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/printf_core/float_dec_converter_limited.h b/libc/src/stdio/printf_core/float_dec_converter_limited.h
index 9cdc13573d320..0f85d0a8d26b4 100644
--- a/libc/src/stdio/printf_core/float_dec_converter_limited.h
+++ b/libc/src/stdio/printf_core/float_dec_converter_limited.h
@@ -363,8 +363,8 @@ DigitsOutput decimal_digits(DigitsInput input, int precision, bool e_mode) {
       // we made it from and doing the decimal conversion all over again.)
       for (size_t i = output.ndigits; i-- > 0;) {
         if (output.digits[i] != '9') {
-          output.digits[i] = static_cast<char>(internal::int_to_b36_char(
-              internal::b36_char_to_int(output.digits[i]) + 1));
+          output.digits[i] = internal::int_to_b36_char(
+              internal::b36_char_to_int(output.digits[i]) + 1);
           break;
         } else {
           output.digits[i] = '0';
diff --git a/libc/src/stdio/printf_core/float_hex_converter.h b/libc/src/stdio/printf_core/float_hex_converter.h
index 16592e7bac932..9b57f1d803e74 100644
--- a/libc/src/stdio/printf_core/float_hex_converter.h
+++ b/libc/src/stdio/printf_core/float_hex_converter.h
@@ -137,9 +137,9 @@ LIBC_INLINE int convert_float_hex_exp(Writer<write_mode> *writer,
   size_t first_non_zero = 1;
   for (; mant_cur > 0; --mant_cur, mantissa >>= 4) {
     char mant_mod_16 = static_cast<char>(mantissa % 16);
-    char new_digit = static_cast<char>(internal::int_to_b36_char(mant_mod_16));
+    char new_digit = internal::int_to_b36_char(mant_mod_16);
     if (internal::isupper(to_conv.conv_name))
-      new_digit = static_cast<char>(internal::toupper(new_digit));
+      new_digit = internal::toupper(new_digit);
     mant_buffer[mant_cur - 1] = new_digit;
     if (new_digit != '0' && first_non_zero < mant_cur)
       first_non_zero = mant_cur;
@@ -167,8 +167,7 @@ LIBC_INLINE int convert_float_hex_exp(Writer<write_mode> *writer,
 
   size_t exp_cur = EXP_LEN;
   for (; exponent > 0; --exp_cur, exponent /= 10) {
-    exp_buffer[exp_cur - 1] =
-        static_cast<char>(internal::int_to_b36_char(exponent % 10));
+    exp_buffer[exp_cur - 1] = internal::int_to_b36_char(exponent % 10);
   }
   if (exp_cur == EXP_LEN) { // if nothing else was written, write a 0.
     exp_buffer[EXP_LEN - 1] = '0';
diff --git a/libc/src/stdlib/l64a.cpp b/libc/src/stdlib/l64a.cpp
index d59e65e7dc4c2..d8fe8ef86bf7d 100644
--- a/libc/src/stdlib/l64a.cpp
+++ b/libc/src/stdlib/l64a.cpp
@@ -32,15 +32,13 @@ constexpr static char b64_int_to_char(uint32_t num) {
   if (num == 1)
     return '/';
   if (num < 38)
-    return static_cast<char>(
-        internal::toupper(internal::int_to_b36_char(num - 2)));
+    return internal::toupper(internal::int_to_b36_char(num - 2));
 
   // this tolower is technically unnecessary, but it provides safety if we
   // change the default behavior of int_to_b36_char. Also the compiler
   // completely elides it so there's no performance penalty, see:
   // https://godbolt.org/z/o5ennv7fc
-  return static_cast<char>(
-      internal::tolower(internal::int_to_b36_char(num - 2 - 26)));
+  return internal::tolower(internal::int_to_b36_char(num - 2 - 26));
 }
 
 // This function takes a long and converts the low 32 bits of it into at most 6
diff --git a/libc/src/string/strcasestr.cpp b/libc/src/string/strcasestr.cpp
index de8e4bec7fe0b..575d6bed16d11 100644
--- a/libc/src/string/strcasestr.cpp
+++ b/libc/src/string/strcasestr.cpp
@@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(char *, strcasestr,
                    (const char *haystack, const char *needle)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
 
   LIBC_CRASH_ON_NULLPTR(haystack);
diff --git a/libc/src/strings/strcasecmp.cpp b/libc/src/strings/strcasecmp.cpp
index 4bbe2909df1e2..4518647deabe4 100644
--- a/libc/src/strings/strcasecmp.cpp
+++ b/libc/src/strings/strcasecmp.cpp
@@ -17,8 +17,8 @@ namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, strcasecmp, (const char *left, const char *right)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
   return inline_strcmp(left, right, case_cmp);
 }
diff --git a/libc/src/strings/strcasecmp_l.cpp b/libc/src/strings/strcasecmp_l.cpp
index 95117cb27a564..d77f95637a396 100644
--- a/libc/src/strings/strcasecmp_l.cpp
+++ b/libc/src/strings/strcasecmp_l.cpp
@@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, strcasecmp_l,
                    (const char *left, const char *right, locale_t)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
   return inline_strcmp(left, right, case_cmp);
 }
diff --git a/libc/src/strings/strncasecmp.cpp b/libc/src/strings/strncasecmp.cpp
index 9c2f0ab131269..a5926495a3e22 100644
--- a/libc/src/strings/strncasecmp.cpp
+++ b/libc/src/strings/strncasecmp.cpp
@@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, strncasecmp,
                    (const char *left, const char *right, size_t n)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
   return inline_strncmp(left, right, n, case_cmp);
 }
diff --git a/libc/src/strings/strncasecmp_l.cpp b/libc/src/strings/strncasecmp_l.cpp
index 91ac7e5e89107..a828f609fd9e8 100644
--- a/libc/src/strings/strncasecmp_l.cpp
+++ b/libc/src/strings/strncasecmp_l.cpp
@@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, strncasecmp_l,
                    (const char *left, const char *right, size_t n, locale_t)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
   return inline_strncmp(left, right, n, case_cmp);
 }
diff --git a/libc/test/UnitTest/MemoryMatcher.cpp b/libc/test/UnitTest/MemoryMatcher.cpp
index 6e375768e9333..405f226798f7a 100644
--- a/libc/test/UnitTest/MemoryMatcher.cpp
+++ b/libc/test/UnitTest/MemoryMatcher.cpp
@@ -41,8 +41,8 @@ bool MemoryMatcher::match(MemoryView actualValue) {
 
 static void display(char C) {
   const auto print = [](unsigned char i) {
-    tlog << static_cast<char>(LIBC_NAMESPACE::internal::toupper(
-        LIBC_NAMESPACE::internal::int_to_b36_char(i)));
+    tlog << LIBC_NAMESPACE::internal::toupper(
+        LIBC_NAMESPACE::internal::int_to_b36_char(i));
   };
   print(static_cast<unsigned char>(C) / 16);
   print(static_cast<unsigned char>(C) & 15);
diff --git a/libc/test/src/ctype/islower_test.cpp b/libc/test/src/ctype/islower_test.cpp
index f877171abb9a3..e4e5f5cefd954 100644
--- a/libc/test/src/ctype/islower_test.cpp
+++ b/libc/test/src/ctype/islower_test.cpp
@@ -40,7 +40,7 @@ TEST(LlvmLibcIsLower, SimpleTest) {
 }
 
 TEST(LlvmLibcIsLower, DefaultLocale) {
-  // Loops through all characters, verifying that numbers and letters
+  // Loops through all characters, verifying that only lowercase letters
   // return non-zero integer and everything else returns a zero.
   for (int ch = -255; ch < 255; ++ch) {
     if (in_span(ch, LOWER_ARRAY))
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 03f0a6539c785..3a7da1fa85ac7 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -177,8 +177,8 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     char small_string[4] = {'\0', '\0', '\0', '\0'};
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<char>(
-            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit);
         if (first_digit < base) {
           ASSERT_EQ(func(small_string, nullptr, base),
                     static_cast<ReturnT>(first_digit));
@@ -192,11 +192,11 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<char>(
-            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit);
         for (int second_digit = 0; second_digit <= 36; ++second_digit) {
-          small_string[1] = static_cast<char>(
-              LIBC_NAMESPACE::internal::int_to_b36_char(second_digit));
+          small_string[1] =
+              LIBC_NAMESPACE::internal::int_to_b36_char(second_digit);
           if (first_digit < base && second_digit < base) {
             ASSERT_EQ(
                 func(small_string, nullptr, base),
@@ -216,14 +216,14 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<char>(
-            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit);
         for (int second_digit = 0; second_digit <= 36; ++second_digit) {
-          small_string[1] = static_cast<char>(
-              LIBC_NAMESPACE::internal::int_to_b36_char(second_digit));
+          small_string[1] =
+              LIBC_NAMESPACE::internal::int_to_b36_char(second_digit);
           for (int third_digit = 0; third_digit <= limit; ++third_digit) {
-            small_string[2] = static_cast<char>(
-                LIBC_NAMESPACE::internal::int_to_b36_char(third_digit));
+            small_string[2] =
+                LIBC_NAMESPACE::internal::int_to_b36_char(third_digit);
 
             if (first_digit < base && second_digit < base &&
                 third_digit < base) {
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 8d225d63cdf3e..9c14a7b991487 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1859,6 +1859,7 @@ libc_function(
     hdrs = ["src/ctype/isalnum.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1869,6 +1870,7 @@ libc_function(
     hdrs = ["src/ctype/isalpha.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1909,6 +1911,7 @@ libc_function(
     hdrs = ["src/ctype/isdigit.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1919,6 +1922,7 @@ libc_function(
     hdrs = ["src/ctype/isgraph.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1929,6 +1933,7 @@ libc_function(
     hdrs = ["src/ctype/islower.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1949,6 +1954,7 @@ libc_function(
     hdrs = ["src/ctype/ispunct.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1959,6 +1965,7 @@ libc_function(
     hdrs = ["src/ctype/isspace.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1969,6 +1976,7 @@ libc_function(
     hdrs = ["src/ctype/isupper.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1979,6 +1987,7 @@ libc_function(
     hdrs = ["src/ctype/isxdigit.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1999,6 +2008,7 @@ libc_function(
     hdrs = ["src/ctype/tolower.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -2009,6 +2019,7 @@ libc_function(
     hdrs = ["src/ctype/toupper.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )

From e7f7973899f76773ae6e9a6b1e8c7e9f9cc5cb56 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 5 Nov 2025 11:03:25 -0800
Subject: [PATCH 45/61] [libc] Migrate wctype_utils to use wchar_t where
 applicable. (#166234)

This is a counterpart of
https://github.com/llvm/llvm-project/pull/166225 but for wctype_utils
(which are not yet widely used). For now, I'm just changing the types
from wint_t to wchar_t to match the regular ctype_utils change. The next
change may rename most of the functions to match the name of ctype_utils
variants, so that we could be calling them from the templated code
operating on "const char*" and "const wchar_t*" strings, and the right
function signature would be picked up.
---
 libc/src/__support/CMakeLists.txt             |  1 +
 libc/src/__support/wctype_utils.h             | 23 +++++++++---------
 libc/src/wctype/iswalpha.cpp                  |  4 +++-
 libc/test/src/wchar/WcstolTest.h              | 24 +++++++++----------
 .../llvm-project-overlay/libc/BUILD.bazel     |  1 +
 5 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index b7af751ec3f27..96874702b1fdf 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -161,6 +161,7 @@ add_header_library(
   HDRS
     wctype_utils.h
   DEPENDS
+    libc.hdr.types.wchar_t
     libc.hdr.types.wint_t
 )
 
diff --git a/libc/src/__support/wctype_utils.h b/libc/src/__support/wctype_utils.h
index 2ae5ec93b2a63..60b6afb928475 100644
--- a/libc/src/__support/wctype_utils.h
+++ b/libc/src/__support/wctype_utils.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H
 #define LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H
 
+#include "hdr/types/wchar_t.h"
 #include "hdr/types/wint_t.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
@@ -30,7 +31,7 @@ namespace internal {
 
 // Similarly, do not change these fumarks to show your new solution is faster,
 // as well as a way to support non-Anctions to use case ranges. e.g.
-//  bool iswlower(wint_t ch) {
+//  bool iswlower(wchar_t ch) {
 //    switch(ch) {
 //    case L'a'...L'z':
 //      return true;
@@ -40,7 +41,7 @@ namespace internal {
 // EBCDIC. Technically we could use some smaller ranges, but that's even harder
 // to read.
 
-LIBC_INLINE static constexpr bool iswlower(wint_t wch) {
+LIBC_INLINE static constexpr bool iswlower(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -74,7 +75,7 @@ LIBC_INLINE static constexpr bool iswlower(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswupper(wint_t wch) {
+LIBC_INLINE static constexpr bool iswupper(wchar_t wch) {
   switch (wch) {
   case L'A':
   case L'B':
@@ -108,7 +109,7 @@ LIBC_INLINE static constexpr bool iswupper(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswdigit(wint_t wch) {
+LIBC_INLINE static constexpr bool iswdigit(wchar_t wch) {
   switch (wch) {
   case L'0':
   case L'1':
@@ -126,7 +127,7 @@ LIBC_INLINE static constexpr bool iswdigit(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr wint_t towlower(wint_t wch) {
+LIBC_INLINE static constexpr wchar_t towlower(wchar_t wch) {
   switch (wch) {
   case L'A':
     return L'a';
@@ -185,7 +186,7 @@ LIBC_INLINE static constexpr wint_t towlower(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr wint_t towupper(wint_t wch) {
+LIBC_INLINE static constexpr wchar_t towupper(wchar_t wch) {
   switch (wch) {
   case L'a':
     return L'A';
@@ -244,7 +245,7 @@ LIBC_INLINE static constexpr wint_t towupper(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswalpha(wint_t wch) {
+LIBC_INLINE static constexpr bool iswalpha(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -304,7 +305,7 @@ LIBC_INLINE static constexpr bool iswalpha(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswalnum(wint_t wch) {
+LIBC_INLINE static constexpr bool iswalnum(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -374,7 +375,7 @@ LIBC_INLINE static constexpr bool iswalnum(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr int b36_wchar_to_int(wint_t wch) {
+LIBC_INLINE static constexpr int b36_wchar_to_int(wchar_t wch) {
   switch (wch) {
   case L'0':
     return 0;
@@ -479,7 +480,7 @@ LIBC_INLINE static constexpr int b36_wchar_to_int(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr wint_t int_to_b36_wchar(int num) {
+LIBC_INLINE static constexpr wchar_t int_to_b36_wchar(int num) {
   // Can't actually use LIBC_ASSERT here because it depends on integer_to_string
   // which depends on this.
 
@@ -562,7 +563,7 @@ LIBC_INLINE static constexpr wint_t int_to_b36_wchar(int num) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswspace(wint_t wch) {
+LIBC_INLINE static constexpr bool iswspace(wchar_t wch) {
   switch (wch) {
   case L' ':
   case L'\t':
diff --git a/libc/src/wctype/iswalpha.cpp b/libc/src/wctype/iswalpha.cpp
index 09f55d391dbff..e151363b88d0b 100644
--- a/libc/src/wctype/iswalpha.cpp
+++ b/libc/src/wctype/iswalpha.cpp
@@ -14,6 +14,8 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(int, iswalpha, (wint_t c)) { return internal::iswalpha(c); }
+LLVM_LIBC_FUNCTION(int, iswalpha, (wint_t c)) {
+  return internal::iswalpha(static_cast<wchar_t>(c));
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/wchar/WcstolTest.h b/libc/test/src/wchar/WcstolTest.h
index 4d5b752e62238..cadf9e0c42b90 100644
--- a/libc/test/src/wchar/WcstolTest.h
+++ b/libc/test/src/wchar/WcstolTest.h
@@ -178,8 +178,8 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     wchar_t small_string[4] = {L'\0', L'\0', L'\0', L'\0'};
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<wchar_t>(
-            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit);
         if (first_digit < base) {
           ASSERT_EQ(func(small_string, nullptr, base),
                     static_cast<ReturnT>(first_digit));
@@ -193,11 +193,11 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<wchar_t>(
-            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit);
         for (int second_digit = 0; second_digit <= 36; ++second_digit) {
-          small_string[1] = static_cast<wchar_t>(
-              LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit));
+          small_string[1] =
+              LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit);
           if (first_digit < base && second_digit < base) {
             ASSERT_EQ(
                 func(small_string, nullptr, base),
@@ -217,14 +217,14 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<wchar_t>(
-            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit);
         for (int second_digit = 0; second_digit <= 36; ++second_digit) {
-          small_string[1] = static_cast<wchar_t>(
-              LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit));
+          small_string[1] =
+              LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit);
           for (int third_digit = 0; third_digit <= limit; ++third_digit) {
-            small_string[2] = static_cast<wchar_t>(
-                LIBC_NAMESPACE::internal::int_to_b36_wchar(third_digit));
+            small_string[2] =
+                LIBC_NAMESPACE::internal::int_to_b36_wchar(third_digit);
 
             if (first_digit < base && second_digit < base &&
                 third_digit < base) {
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 9c14a7b991487..b65fe64acdea0 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1805,6 +1805,7 @@ libc_support_library(
         ":__support_cpp_optional",
         ":__support_macros_attributes",
         ":__support_macros_config",
+        ":types_wchar_t",
         ":types_wint_t",
     ],
 )

From 37fff6e17ee29e790f850f6e133d14a73c08a0f8 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Wed, 5 Nov 2025 11:06:22 -0800
Subject: [PATCH 46/61] [NFC][LLVM][IR] Cleanup namespace usage in LLVM IR cpp
 files (#166477)

---
 llvm/lib/IR/AsmWriter.cpp               |  6 +-----
 llvm/lib/IR/DebugLoc.cpp                |  6 ++----
 llvm/lib/IR/DebugProgramInstruction.cpp |  6 +++---
 llvm/lib/IR/FPEnv.cpp                   | 16 ++++++++--------
 llvm/lib/IR/Operator.cpp                |  4 ++--
 llvm/lib/IR/PassTimingInfo.cpp          | 12 +++++-------
 llvm/lib/IR/PseudoProbe.cpp             | 12 ++++--------
 llvm/lib/IR/ReplaceConstant.cpp         | 12 +++++-------
 llvm/lib/IR/Use.cpp                     |  4 +---
 llvm/lib/IR/User.cpp                    |  5 +++--
 llvm/lib/IR/Verifier.cpp                |  6 +-----
 11 files changed, 35 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 95d954f6b8174..0c8565c927a24 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -758,14 +758,12 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
 
 AbstractSlotTrackerStorage::~AbstractSlotTrackerStorage() = default;
 
-namespace llvm {
-
 //===----------------------------------------------------------------------===//
 // SlotTracker Class: Enumerate slot numbers for unnamed values
 //===----------------------------------------------------------------------===//
 /// This class provides computation of slot numbers for LLVM Assembly writing.
 ///
-class SlotTracker : public AbstractSlotTrackerStorage {
+class llvm::SlotTracker : public AbstractSlotTrackerStorage {
 public:
   /// ValueMap - A mapping of Values to slot numbers.
   using ValueMap = DenseMap<const Value *, unsigned>;
@@ -943,8 +941,6 @@ class SlotTracker : public AbstractSlotTrackerStorage {
   void processDbgRecordMetadata(const DbgRecord &DVR);
 };
 
-} // end namespace llvm
-
 ModuleSlotTracker::ModuleSlotTracker(SlotTracker &Machine, const Module *M,
                                      const Function *F)
     : M(M), F(F), Machine(&Machine) {}
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 01dafcab94ce9..bfba6e0cab6bf 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -10,10 +10,11 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/DebugInfo.h"
 
+using namespace llvm;
+
 #if LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN
 #include "llvm/Support/Signals.h"
 
-namespace llvm {
 DbgLocOrigin::DbgLocOrigin(bool ShouldCollectTrace) {
   if (!ShouldCollectTrace)
     return;
@@ -30,11 +31,8 @@ void DbgLocOrigin::addTrace() {
   auto &[Depth, StackTrace] = StackTraces.emplace_back();
   Depth = sys::getStackTrace(StackTrace);
 }
-} // namespace llvm
 #endif
 
-using namespace llvm;
-
 #if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L)
     : TrackingMDNodeRef(const_cast<DILocation *>(L)), DbgLocOrigin(!L),
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index d9357bba75510..6b1fd3907dc41 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -12,8 +12,9 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Compiler.h"
 
-namespace llvm {
+using namespace llvm;
 
+namespace llvm {
 template <typename T>
 DbgRecordParamRef<T>::DbgRecordParamRef(const T *Param)
     : Ref(const_cast<T *>(Param)) {}
@@ -28,6 +29,7 @@ template <typename T> T *DbgRecordParamRef<T>::get() const {
 template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DIExpression>;
 template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILabel>;
 template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILocalVariable>;
+} // namespace llvm
 
 DbgVariableRecord::DbgVariableRecord(const DbgVariableIntrinsic *DVI)
     : DbgRecord(ValueKind, DVI->getDebugLoc()),
@@ -755,5 +757,3 @@ iterator_range<simple_ilist<DbgRecord>::iterator> DbgMarker::cloneDebugInfoFrom(
     // We inserted a block at the end, return that range.
     return {First->getIterator(), StoredDbgRecords.end()};
 }
-
-} // end namespace llvm
diff --git a/llvm/lib/IR/FPEnv.cpp b/llvm/lib/IR/FPEnv.cpp
index 67f21d3756e93..c41d7b3181a37 100644
--- a/llvm/lib/IR/FPEnv.cpp
+++ b/llvm/lib/IR/FPEnv.cpp
@@ -19,9 +19,10 @@
 #include "llvm/IR/Intrinsics.h"
 #include <optional>
 
-namespace llvm {
+using namespace llvm;
 
-std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) {
+std::optional<RoundingMode>
+llvm::convertStrToRoundingMode(StringRef RoundingArg) {
   // For dynamic rounding mode, we use round to nearest but we will set the
   // 'exact' SDNodeFlag so that the value will not be rounded.
   return StringSwitch<std::optional<RoundingMode>>(RoundingArg)
@@ -34,7 +35,8 @@ std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) {
       .Default(std::nullopt);
 }
 
-std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) {
+std::optional<StringRef>
+llvm::convertRoundingModeToStr(RoundingMode UseRounding) {
   std::optional<StringRef> RoundingStr;
   switch (UseRounding) {
   case RoundingMode::Dynamic:
@@ -62,7 +64,7 @@ std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) {
 }
 
 std::optional<fp::ExceptionBehavior>
-convertStrToExceptionBehavior(StringRef ExceptionArg) {
+llvm::convertStrToExceptionBehavior(StringRef ExceptionArg) {
   return StringSwitch<std::optional<fp::ExceptionBehavior>>(ExceptionArg)
       .Case("fpexcept.ignore", fp::ebIgnore)
       .Case("fpexcept.maytrap", fp::ebMayTrap)
@@ -71,7 +73,7 @@ convertStrToExceptionBehavior(StringRef ExceptionArg) {
 }
 
 std::optional<StringRef>
-convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
+llvm::convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
   std::optional<StringRef> ExceptStr;
   switch (UseExcept) {
   case fp::ebStrict:
@@ -87,7 +89,7 @@ convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
   return ExceptStr;
 }
 
-Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) {
+Intrinsic::ID llvm::getConstrainedIntrinsicID(const Instruction &Instr) {
   Intrinsic::ID IID = Intrinsic::not_intrinsic;
   switch (Instr.getOpcode()) {
   case Instruction::FCmp:
@@ -127,5 +129,3 @@ Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) {
 
   return IID;
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp
index 39e5463cb6fc3..c3e54a0fc0c7e 100644
--- a/llvm/lib/IR/Operator.cpp
+++ b/llvm/lib/IR/Operator.cpp
@@ -17,7 +17,8 @@
 
 #include "ConstantsContext.h"
 
-namespace llvm {
+using namespace llvm;
+
 bool Operator::hasPoisonGeneratingFlags() const {
   switch (getOpcode()) {
   case Instruction::Add:
@@ -288,4 +289,3 @@ void FastMathFlags::print(raw_ostream &O) const {
       O << " afn";
   }
 }
-} // namespace llvm
diff --git a/llvm/lib/IR/PassTimingInfo.cpp b/llvm/lib/IR/PassTimingInfo.cpp
index 4e27086e97ac5..cb1b91a98b036 100644
--- a/llvm/lib/IR/PassTimingInfo.cpp
+++ b/llvm/lib/IR/PassTimingInfo.cpp
@@ -32,10 +32,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "time-passes"
 
-namespace llvm {
+using namespace llvm;
 
-bool TimePassesIsEnabled = false;
-bool TimePassesPerRun = false;
+bool llvm::TimePassesIsEnabled = false;
+bool llvm::TimePassesPerRun = false;
 
 static cl::opt<bool, true> EnableTiming(
     "time-passes", cl::location(TimePassesIsEnabled), cl::Hidden,
@@ -139,7 +139,7 @@ PassTimingInfo *PassTimingInfo::TheTimeInfo;
 } // namespace legacy
 } // namespace
 
-Timer *getPassTimer(Pass *P) {
+Timer *llvm::getPassTimer(Pass *P) {
   legacy::PassTimingInfo::init();
   if (legacy::PassTimingInfo::TheTimeInfo)
     return legacy::PassTimingInfo::TheTimeInfo->getPassTimer(P, P);
@@ -148,7 +148,7 @@ Timer *getPassTimer(Pass *P) {
 
 /// If timing is enabled, report the times collected up to now and then reset
 /// them.
-void reportAndResetTimings(raw_ostream *OutStream) {
+void llvm::reportAndResetTimings(raw_ostream *OutStream) {
   if (legacy::PassTimingInfo::TheTimeInfo)
     legacy::PassTimingInfo::TheTimeInfo->print(OutStream);
 }
@@ -315,5 +315,3 @@ void TimePassesHandler::registerCallbacks(PassInstrumentationCallbacks &PIC) {
   PIC.registerAfterAnalysisCallback(
       [this](StringRef P, Any) { this->stopAnalysisTimer(P); });
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp
index 59f218cc3683b..3c05f4b1f86a2 100644
--- a/llvm/lib/IR/PseudoProbe.cpp
+++ b/llvm/lib/IR/PseudoProbe.cpp
@@ -19,9 +19,7 @@
 
 using namespace llvm;
 
-namespace llvm {
-
-std::optional<PseudoProbe>
+static std::optional<PseudoProbe>
 extractProbeFromDiscriminator(const DILocation *DIL) {
   if (DIL) {
     auto Discriminator = DIL->getDiscriminator();
@@ -43,7 +41,7 @@ extractProbeFromDiscriminator(const DILocation *DIL) {
   return std::nullopt;
 }
 
-std::optional<PseudoProbe>
+static std::optional<PseudoProbe>
 extractProbeFromDiscriminator(const Instruction &Inst) {
   assert(isa<CallBase>(&Inst) && !isa<IntrinsicInst>(&Inst) &&
          "Only call instructions should have pseudo probe encodes as their "
@@ -53,7 +51,7 @@ extractProbeFromDiscriminator(const Instruction &Inst) {
   return std::nullopt;
 }
 
-std::optional<PseudoProbe> extractProbe(const Instruction &Inst) {
+std::optional<PseudoProbe> llvm::extractProbe(const Instruction &Inst) {
   if (const auto *II = dyn_cast<PseudoProbeInst>(&Inst)) {
     PseudoProbe Probe;
     Probe.Id = II->getIndex()->getZExtValue();
@@ -73,7 +71,7 @@ std::optional<PseudoProbe> extractProbe(const Instruction &Inst) {
   return std::nullopt;
 }
 
-void setProbeDistributionFactor(Instruction &Inst, float Factor) {
+void llvm::setProbeDistributionFactor(Instruction &Inst, float Factor) {
   assert(Factor >= 0 && Factor <= 1 &&
          "Distribution factor must be in [0, 1.0]");
   if (auto *II = dyn_cast<PseudoProbeInst>(&Inst)) {
@@ -111,5 +109,3 @@ void setProbeDistributionFactor(Instruction &Inst, float Factor) {
     }
   }
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp
index 962368f061851..b3586b45a23f2 100644
--- a/llvm/lib/IR/ReplaceConstant.cpp
+++ b/llvm/lib/IR/ReplaceConstant.cpp
@@ -16,7 +16,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 
-namespace llvm {
+using namespace llvm;
 
 static bool isExpandableUser(User *U) {
   return isa<ConstantExpr>(U) || isa<ConstantAggregate>(U);
@@ -49,10 +49,10 @@ static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt,
   return NewInsts;
 }
 
-bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
-                                           Function *RestrictToFunc,
-                                           bool RemoveDeadConstants,
-                                           bool IncludeSelf) {
+bool llvm::convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
+                                                 Function *RestrictToFunc,
+                                                 bool RemoveDeadConstants,
+                                                 bool IncludeSelf) {
   // Find all expandable direct users of Consts.
   SmallVector<Constant *> Stack;
   for (Constant *C : Consts) {
@@ -121,5 +121,3 @@ bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
 
   return Changed;
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/Use.cpp b/llvm/lib/IR/Use.cpp
index 67882ba0144b4..504233575594d 100644
--- a/llvm/lib/IR/Use.cpp
+++ b/llvm/lib/IR/Use.cpp
@@ -9,7 +9,7 @@
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
 
-namespace llvm {
+using namespace llvm;
 
 void Use::swap(Use &RHS) {
   if (Val == RHS.Val)
@@ -42,5 +42,3 @@ void Use::zap(Use *Start, const Use *Stop, bool del) {
   if (del)
     ::operator delete(Start);
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp
index ab44cb4b8a3f7..9bb7c1298593a 100644
--- a/llvm/lib/IR/User.cpp
+++ b/llvm/lib/IR/User.cpp
@@ -11,8 +11,11 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IntrinsicInst.h"
 
+using namespace llvm;
+
 namespace llvm {
 class BasicBlock;
+}
 
 //===----------------------------------------------------------------------===//
 //                                 User Class
@@ -214,5 +217,3 @@ LLVM_NO_SANITIZE_MEMORY_ATTRIBUTE void User::operator delete(void *Usr) {
     ::operator delete(Storage);
   }
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 7917712846990..24f90bf6de7f5 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -136,9 +136,7 @@ static cl::opt<bool> VerifyNoAliasScopeDomination(
     cl::desc("Ensure that llvm.experimental.noalias.scope.decl for identical "
              "scopes are not dominating"));
 
-namespace llvm {
-
-struct VerifierSupport {
+struct llvm::VerifierSupport {
   raw_ostream *OS;
   const Module &M;
   ModuleSlotTracker MST;
@@ -318,8 +316,6 @@ struct VerifierSupport {
   }
 };
 
-} // namespace llvm
-
 namespace {
 
 class Verifier : public InstVisitor<Verifier>, VerifierSupport {

From 00171b352def8afa314c89a090501e890326fb34 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Wed, 5 Nov 2025 11:06:57 -0800
Subject: [PATCH 47/61] [NFC][TableGen] Adopt CodeGenHelpers in
 SDNodeInfoEmitter (#165622)

Use `IfDefEmitter` and `NamespaceEmitter` in SDNodeInfoEmitter.
---
 .../TableGen/SDNodeInfoEmitter/no-nodes.td    |  1 +
 llvm/utils/TableGen/SDNodeInfoEmitter.cpp     | 23 +++++++------------
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
index 0c5c63db4c95b..cc0f87755cdc2 100644
--- a/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
+++ b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
@@ -20,6 +20,7 @@ def MyTarget : Target;
 // CHECK-EMPTY:
 // CHECK-NEXT:  namespace llvm {
 // CHECK-EMPTY:
+// CHECK-EMPTY:
 // CHECK-NEXT:  #ifdef __GNUC__
 // CHECK-NEXT:  #pragma GCC diagnostic push
 // CHECK-NEXT:  #pragma GCC diagnostic ignored "-Woverlength-strings"
diff --git a/llvm/utils/TableGen/SDNodeInfoEmitter.cpp b/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
index 64f03dae83e7d..dd18d29e6c676 100644
--- a/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
+++ b/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
@@ -10,6 +10,7 @@
 #include "Common/CodeGenDAGPatterns.h" // For SDNodeInfo.
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/TableGen/CodeGenHelpers.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/StringToOffsetTable.h"
 #include "llvm/TableGen/TableGenBackend.h"
@@ -129,9 +130,8 @@ SDNodeInfoEmitter::SDNodeInfoEmitter(const RecordKeeper &RK)
 }
 
 void SDNodeInfoEmitter::emitEnum(raw_ostream &OS) const {
-  OS << "#ifdef GET_SDNODE_ENUM\n";
-  OS << "#undef GET_SDNODE_ENUM\n\n";
-  OS << "namespace llvm::" << TargetSDNodeNamespace << " {\n\n";
+  IfDefEmitter IfDef(OS, "GET_SDNODE_ENUM");
+  NamespaceEmitter NS(OS, "llvm::" + TargetSDNodeNamespace);
 
   if (!NodesByName.empty()) {
     StringRef FirstName = NodesByName.begin()->first;
@@ -145,14 +145,11 @@ void SDNodeInfoEmitter::emitEnum(raw_ostream &OS) const {
 
     OS << "};\n\n";
     OS << "static constexpr unsigned GENERATED_OPCODE_END = " << LastName
-       << " + 1;\n\n";
+       << " + 1;\n";
   } else {
     OS << "static constexpr unsigned GENERATED_OPCODE_END = "
-          "ISD::BUILTIN_OP_END;\n\n";
+          "ISD::BUILTIN_OP_END;\n";
   }
-
-  OS << "} // namespace llvm::" << TargetSDNodeNamespace << "\n\n";
-  OS << "#endif // GET_SDNODE_ENUM\n\n";
 }
 
 std::vector<unsigned> SDNodeInfoEmitter::emitNodeNames(raw_ostream &OS) const {
@@ -324,9 +321,8 @@ static void emitDesc(raw_ostream &OS, StringRef EnumName,
 void SDNodeInfoEmitter::emitDescs(raw_ostream &OS) const {
   StringRef TargetName = Target.getName();
 
-  OS << "#ifdef GET_SDNODE_DESC\n";
-  OS << "#undef GET_SDNODE_DESC\n\n";
-  OS << "namespace llvm {\n";
+  IfDefEmitter IfDef(OS, "GET_SDNODE_DESC");
+  NamespaceEmitter NS(OS, "llvm");
 
   std::vector<unsigned> NameOffsets = emitNodeNames(OS);
   std::vector<std::pair<unsigned, unsigned>> ConstraintOffsetsAndCounts =
@@ -343,11 +339,8 @@ void SDNodeInfoEmitter::emitDescs(raw_ostream &OS) const {
 
   OS << formatv("static const SDNodeInfo {0}GenSDNodeInfo(\n"
                 "    /*NumOpcodes=*/{1}, {0}SDNodeDescs,\n"
-                "    {0}SDNodeNames, {0}SDTypeConstraints);\n\n",
+                "    {0}SDNodeNames, {0}SDTypeConstraints);\n",
                 TargetName, NodesByName.size());
-
-  OS << "} // namespace llvm\n\n";
-  OS << "#endif // GET_SDNODE_DESC\n\n";
 }
 
 void SDNodeInfoEmitter::run(raw_ostream &OS) const {

From 28a279ce14f913df71546d8201d5363682a75901 Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Wed, 5 Nov 2025 19:07:34 +0000
Subject: [PATCH 48/61] [lldb-dap] expand tilde in dap executable path
 (#162635)

Users may have multiple devices and would like to resolve the homepath
based on the machine they are on.

expands the tilde `~` character at the front of the given file path.
---
 .../lldb-dap/src-ts/debug-adapter-factory.ts  |  6 ++-
 lldb/tools/lldb-dap/src-ts/utils.ts           | 41 +++++++++++++++++++
 2 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 lldb/tools/lldb-dap/src-ts/utils.ts

diff --git a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts
index 7060638a94864..433d48fab9d85 100644
--- a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts
+++ b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts
@@ -6,6 +6,7 @@ import * as fs from "node:fs/promises";
 import { ConfigureButton, OpenSettingsButton } from "./ui/show-error-message";
 import { ErrorWithNotification } from "./ui/error-with-notification";
 import { LogFilePathProvider, LogType } from "./logging";
+import { expandUser } from "./utils";
 
 const exec = util.promisify(child_process.execFile);
 
@@ -116,8 +117,9 @@ async function getDAPExecutable(
   configuration: vscode.DebugConfiguration,
 ): Promise<string> {
   // Check if the executable was provided in the launch configuration.
-  const launchConfigPath = configuration["debugAdapterExecutable"];
+  let launchConfigPath = configuration["debugAdapterExecutable"];
   if (typeof launchConfigPath === "string" && launchConfigPath.length !== 0) {
+    launchConfigPath = expandUser(launchConfigPath);
     if (!(await isExecutable(launchConfigPath))) {
       throw new ErrorWithNotification(
         `Debug adapter path "${launchConfigPath}" is not a valid file. The path comes from your launch configuration.`,
@@ -129,7 +131,7 @@ async function getDAPExecutable(
 
   // Check if the executable was provided in the extension's configuration.
   const config = vscode.workspace.getConfiguration("lldb-dap", workspaceFolder);
-  const configPath = config.get<string>("executable-path");
+  const configPath = expandUser(config.get<string>("executable-path") ?? "");
   if (configPath && configPath.length !== 0) {
     if (!(await isExecutable(configPath))) {
       throw new ErrorWithNotification(
diff --git a/lldb/tools/lldb-dap/src-ts/utils.ts b/lldb/tools/lldb-dap/src-ts/utils.ts
new file mode 100644
index 0000000000000..efebe0b0f42ba
--- /dev/null
+++ b/lldb/tools/lldb-dap/src-ts/utils.ts
@@ -0,0 +1,41 @@
+import * as os from "os";
+import * as path from "path";
+
+/**
+ * Expands the character `~` to the user's home directory
+ */
+export function expandUser(file_path: string): string {
+  if (os.platform() == "win32") {
+    return file_path;
+  }
+
+  if (!file_path) {
+    return "";
+  }
+
+  if (!file_path.startsWith("~")) {
+    return file_path;
+  }
+
+  const path_len = file_path.length;
+  if (path_len == 1) {
+    return os.homedir();
+  }
+
+  if (file_path.charAt(1) == path.sep) {
+    return path.join(os.homedir(), file_path.substring(1));
+  }
+
+  const sep_index = file_path.indexOf(path.sep);
+  const user_name_end = sep_index == -1 ? file_path.length : sep_index;
+  const user_name = file_path.substring(1, user_name_end);
+  try {
+    if (user_name == os.userInfo().username) {
+      return path.join(os.homedir(), file_path.substring(user_name_end));
+    }
+  } catch (err) {
+    return file_path;
+  }
+
+  return file_path;
+}

From 3d0a3674d9ae52ed685ce467a48653cc27a2e5eb Mon Sep 17 00:00:00 2001
From: Marcell Leleszi <59964679+mleleszi@users.noreply.github.com>
Date: Wed, 5 Nov 2025 20:17:13 +0100
Subject: [PATCH 49/61] [libc] Make errno asserts noop on gpu targets (#166606)

This patch defines errno unit and integration test asserts as noop on
GPU targets. Checking for errnos is tests has caused build breakages in
previous patches.
---
 libc/test/IntegrationTest/CMakeLists.txt | 1 +
 libc/test/IntegrationTest/test.h         | 7 +++++++
 libc/test/UnitTest/CMakeLists.txt        | 1 +
 libc/test/UnitTest/ErrnoCheckingTest.h   | 7 +++++++
 4 files changed, 16 insertions(+)

diff --git a/libc/test/IntegrationTest/CMakeLists.txt b/libc/test/IntegrationTest/CMakeLists.txt
index 235e9fe2f55ee..d0752ea178429 100644
--- a/libc/test/IntegrationTest/CMakeLists.txt
+++ b/libc/test/IntegrationTest/CMakeLists.txt
@@ -14,5 +14,6 @@ add_object_library(
     libc.hdr.stdint_proxy
     libc.src.__support.OSUtil.osutil
     libc.src.__support.CPP.atomic
+    libc.src.__support.macros.properties.architectures
     ${arch_specific_deps}
 )
diff --git a/libc/test/IntegrationTest/test.h b/libc/test/IntegrationTest/test.h
index 4a03f7aa6318b..9f5a3dfb3583c 100644
--- a/libc/test/IntegrationTest/test.h
+++ b/libc/test/IntegrationTest/test.h
@@ -11,6 +11,7 @@
 
 #include "src/__support/OSUtil/exit.h"
 #include "src/__support/OSUtil/io.h"
+#include "src/__support/macros/properties/architectures.h"
 
 #define __AS_STRING(val) #val
 #define __CHECK_TRUE(file, line, val, should_exit)                             \
@@ -68,9 +69,15 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Errno checks.
 
+#ifdef LIBC_TARGET_ARCH_IS_GPU
+#define ASSERT_ERRNO_EQ(VAL)
+#define ASSERT_ERRNO_SUCCESS()
+#define ASSERT_ERRNO_FAILURE()
+#else
 #define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast<int>(errno))
 #define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast<int>(errno))
 #define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast<int>(errno))
+#endif
 
 // Integration tests are compiled with -ffreestanding which stops treating
 // the main function as a non-overloadable special function. Hence, we use a
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index 31d1e9dce8204..3197b3d7fd01b 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -204,5 +204,6 @@ add_header_library(
     ErrnoCheckingTest.h
   DEPENDS
     libc.src.__support.common
+    libc.src.__support.macros.properties.architectures
     libc.src.errno.errno
 )
diff --git a/libc/test/UnitTest/ErrnoCheckingTest.h b/libc/test/UnitTest/ErrnoCheckingTest.h
index 5b1bc9441d830..111d812c58612 100644
--- a/libc/test/UnitTest/ErrnoCheckingTest.h
+++ b/libc/test/UnitTest/ErrnoCheckingTest.h
@@ -11,11 +11,17 @@
 
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/Test.h"
 
 // Define macro to validate the value stored in the errno and restore it
 // to zero.
 
+#ifdef LIBC_TARGET_ARCH_IS_GPU
+#define ASSERT_ERRNO_EQ(VAL)
+#define ASSERT_ERRNO_SUCCESS()
+#define ASSERT_ERRNO_FAILURE()
+#else
 #define ASSERT_ERRNO_EQ(VAL)                                                   \
   do {                                                                         \
     ASSERT_EQ(VAL, static_cast<int>(libc_errno));                              \
@@ -27,6 +33,7 @@
     ASSERT_NE(0, static_cast<int>(libc_errno));                                \
     libc_errno = 0;                                                            \
   } while (0)
+#endif
 
 namespace LIBC_NAMESPACE_DECL {
 namespace testing {

From e2d2affc70a8191ea67eee697e83ef4834c6b4a8 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
Date: Wed, 5 Nov 2025 11:21:52 -0800
Subject: [PATCH 50/61] [AMDGPU][LowerBufferFatPointers] Fix crash with `select
 false` (#166471)

If the input to LowerBufferFatPointers is such that the resource- and
offset-specific `select` instructions generated for a `select` on `ptr
addrspae(7)` fold away, the pass would crash when trying to replace an
instruction with itself. This commit resolves the issue.

Fixes https://github.com/iree-org/iree/issues/22551
---
 .../Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp   |  7 +++++--
 .../AMDGPU/lower-buffer-fat-pointers-control-flow.ll | 12 ++++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 0a5913293238a..fdff21b6ef8df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1565,8 +1565,11 @@ void SplitPtrStructs::processConditionals() {
     } else if (isa<SelectInst>(I)) {
       if (MaybeRsrc) {
         if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
-          ConditionalTemps.push_back(RsrcInst);
-          RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+          // Guard against conditionals that were already folded away.
+          if (RsrcInst != *MaybeRsrc) {
+            ConditionalTemps.push_back(RsrcInst);
+            RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+          }
         }
         for (Value *V : Seen)
           FoundRsrcs[V] = *MaybeRsrc;
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll
index 4fa7c29bfde02..71005224dd1e5 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll
@@ -481,3 +481,15 @@ define void @dominance_not_in_program_order(ptr addrspace(7) inreg %arg) {
   %lsr.iv11 = phi ptr addrspace(7) [ %arg, %.loopexit ], [ %arg, %.preheader15 ]
   br label %.loopexit
 }
+
+;; iree-org/iree#22551 - crash on something that reduces to the below non-canonical select.
+define ptr addrspace(7) @noncanonical_const_cond(ptr addrspace(7) %x) {
+; CHECK-LABEL: define { ptr addrspace(8), i32 } @noncanonical_const_cond
+; CHECK-SAME: ({ ptr addrspace(8), i32 } [[RET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 0
+; CHECK-NEXT:    [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 1
+; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[RET]]
+;
+  %ret = select i1 false, ptr addrspace(7) %x, ptr addrspace(7) %x
+  ret ptr addrspace(7) %ret
+}

From 1041423393ff64834df793a8bd982fa6c898d5d8 Mon Sep 17 00:00:00 2001
From: SKill <skill@google.com>
Date: Wed, 5 Nov 2025 20:29:50 +0100
Subject: [PATCH 51/61] [clang][SourceManager] Reuse code when computing Column
 and Line numbers (#166593)

---
 clang/include/clang/Basic/SourceManager.h | 20 +++++++++++++----
 clang/lib/Basic/SourceManager.cpp         | 27 +++++++----------------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
index 6d9d074d78026..bc9e97863556d 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -1409,10 +1409,15 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// before calling this method.
   unsigned getColumnNumber(FileID FID, unsigned FilePos,
                            bool *Invalid = nullptr) const;
+  unsigned getColumnNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
   unsigned getSpellingColumnNumber(SourceLocation Loc,
-                                   bool *Invalid = nullptr) const;
+                                   bool *Invalid = nullptr) const {
+    return getColumnNumber(getSpellingLoc(Loc), Invalid);
+  }
   unsigned getExpansionColumnNumber(SourceLocation Loc,
-                                    bool *Invalid = nullptr) const;
+                                    bool *Invalid = nullptr) const {
+    return getColumnNumber(getExpansionLoc(Loc), Invalid);
+  }
   unsigned getPresumedColumnNumber(SourceLocation Loc,
                                    bool *Invalid = nullptr) const;
 
@@ -1423,8 +1428,15 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// MemoryBuffer, so this is not cheap: use only when about to emit a
   /// diagnostic.
   unsigned getLineNumber(FileID FID, unsigned FilePos, bool *Invalid = nullptr) const;
-  unsigned getSpellingLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
-  unsigned getExpansionLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
+  unsigned getLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
+  unsigned getSpellingLineNumber(SourceLocation Loc,
+                                 bool *Invalid = nullptr) const {
+    return getLineNumber(getSpellingLoc(Loc), Invalid);
+  }
+  unsigned getExpansionLineNumber(SourceLocation Loc,
+                                  bool *Invalid = nullptr) const {
+    return getLineNumber(getExpansionLoc(Loc), Invalid);
+  }
   unsigned getPresumedLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
 
   /// Return the filename or buffer identifier of the buffer the
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 7dc81c50f87a2..b6cc6ec9365f5 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -1159,17 +1159,11 @@ static bool isInvalid(LocType Loc, bool *Invalid) {
   return MyInvalid;
 }
 
-unsigned SourceManager::getSpellingColumnNumber(SourceLocation Loc,
-                                                bool *Invalid) const {
-  if (isInvalid(Loc, Invalid)) return 0;
-  FileIDAndOffset LocInfo = getDecomposedSpellingLoc(Loc);
-  return getColumnNumber(LocInfo.first, LocInfo.second, Invalid);
-}
-
-unsigned SourceManager::getExpansionColumnNumber(SourceLocation Loc,
-                                                 bool *Invalid) const {
+unsigned SourceManager::getColumnNumber(SourceLocation Loc,
+                                        bool *Invalid) const {
+  assert(Loc.isFileID());
   if (isInvalid(Loc, Invalid)) return 0;
-  FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc);
+  FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
   return getColumnNumber(LocInfo.first, LocInfo.second, Invalid);
 }
 
@@ -1367,18 +1361,13 @@ unsigned SourceManager::getLineNumber(FileID FID, unsigned FilePos,
   return LineNo;
 }
 
-unsigned SourceManager::getSpellingLineNumber(SourceLocation Loc,
-                                              bool *Invalid) const {
-  if (isInvalid(Loc, Invalid)) return 0;
-  FileIDAndOffset LocInfo = getDecomposedSpellingLoc(Loc);
-  return getLineNumber(LocInfo.first, LocInfo.second);
-}
-unsigned SourceManager::getExpansionLineNumber(SourceLocation Loc,
-                                               bool *Invalid) const {
+unsigned SourceManager::getLineNumber(SourceLocation Loc, bool *Invalid) const {
+  assert(Loc.isFileID());
   if (isInvalid(Loc, Invalid)) return 0;
-  FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc);
+  FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
   return getLineNumber(LocInfo.first, LocInfo.second);
 }
+
 unsigned SourceManager::getPresumedLineNumber(SourceLocation Loc,
                                               bool *Invalid) const {
   PresumedLoc PLoc = getPresumedLoc(Loc);

From db6231b4c2e18bb5fc107624e9c9071b02124844 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86@yahoo.com>
Date: Wed, 5 Nov 2025 11:52:56 -0800
Subject: [PATCH 52/61] [AMDGPU][MC] GFX9 - Support NV bit in FLAT instructions
 in pre-GFX90A (#154237) targets

This patch enables support of the NV (non-volatile) bit in FLAT
instructions in GFX9 (pre-GFX90A) targets.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |  12 +
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |  14 +-
 llvm/lib/Target/AMDGPU/FLATInstructions.td    | 140 ++-
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp |   8 +-
 llvm/test/MC/AMDGPU/gfx90a_err.s              |  43 +
 llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s         |  52 +-
 llvm/test/MC/AMDGPU/gfx942_err.s              |  28 +
 llvm/test/MC/AMDGPU/gfx9_asm_flat.s           | 858 +++++++++++++++++
 .../test/MC/Disassembler/AMDGPU/gfx9_flat.txt | 864 ++++++++++++++++++
 9 files changed, 1941 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 54d94b1f8682e..4fe194c813c46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2366,6 +2366,18 @@ def isGFX8GFX9NotGFX90A :
             " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
   AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
 
+// Pre-90A GFX9s allow the NV bit in FLAT instructions.
+def isNVAllowedInFlat :
+  Predicate<"!Subtarget->hasGFX90AInsts() &&"
+            " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
+  AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX90AInsts), (not FeatureGFX10Insts))>;
+
+// GFX8 or GFX90A+ do not allow the NV bit in FLAT instructions.
+def isNVNotAllowedInFlat :
+  Predicate<"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) ||"
+            " ((Subtarget->getGeneration() == AMDGPUSubtarget::GFX9) && Subtarget->hasGFX90AInsts())">,
+  AssemblerPredicate <(any_of FeatureVolcanicIslands, FeatureGFX90AInsts)>;
+
 def isGFX90AOnly :
   Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">,
   AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 09338c533fdf2..2808c44c59c11 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1602,6 +1602,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
 
   bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); }
 
+  bool isFlatInstAndNVAllowed(const MCInst &Inst) const {
+    uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+    return (TSFlags & SIInstrFlags::FLAT) && isGFX9() && !isGFX90A();
+  }
+
   AMDGPUTargetStreamer &getTargetStreamer() {
     MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
     return static_cast<AMDGPUTargetStreamer &>(TS);
@@ -5370,7 +5375,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
       S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
       Error(S, "scale_offset is not supported on this GPU");
     }
-    if (CPol & CPol::NV) {
+    if ((CPol & CPol::NV) && !isFlatInstAndNVAllowed(Inst)) {
       SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
       StringRef CStr(S.getPointer());
       S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]);
@@ -7145,6 +7150,13 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
   unsigned Enabled = 0, Seen = 0;
   for (;;) {
     SMLoc S = getLoc();
+
+    if (isGFX9() && trySkipId("nv")) {
+      Enabled |= CPol::NV;
+      Seen |= CPol::NV;
+      continue;
+    }
+
     bool Disabling;
     unsigned CPol = getCPolKind(getId(), Mnemo, Disabling);
     if (!CPol)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 8ea64d17417f7..6ef224148e44b 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -125,7 +125,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   bits<7> saddr;
   bits<10> vdst;
 
-  bits<5> cpol;
+  bits<6> cpol;
 
   // Only valid on gfx9
   bits<1> lds = ps.lds; // LDS DMA for global and scratch
@@ -2693,29 +2693,52 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
                   !subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands);
 }
 
+class FLAT_Real_vi_ex_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
+  FLAT_Real_vi <op, ps, has_sccb> {
+  let AssemblerPredicate = isNVNotAllowedInFlat;
+}
+
+class FLAT_Real_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
+  FLAT_Real_vi <op, ps, has_sccb> {
+  let AssemblerPredicate = isNVAllowedInFlat;
+  let Subtarget = SIEncodingFamily.GFX9;
+  let DecoderNamespace = "GFX9";
+  let Inst{55} = cpol{CPolBit.NV}; // nv - GFX9 (pre-90A) uses bit 55 as the non-volatile bit.
+}
+
+multiclass FLAT_Real_mc_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> {
+  def _vi: FLAT_Real_vi_ex_gfx9<op, ps, has_sccb>;
+  def _gfx9: FLAT_Real_gfx9<op, ps, has_sccb>;
+}
+
 multiclass FLAT_Real_AllAddr_vi<bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
-  def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
-  def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+  defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+  defm _SADDR : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+}
+
+multiclass FLAT_Real_AllAddr_vi_ex_gfx9<bits<7> op,
+  bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+  def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+  def _SADDR_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
 }
 
 class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> :
   FLAT_Real <op, ps>,
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> {
   let AssemblerPredicate = isGFX940Plus;
-  let DecoderNamespace = "GFX9";
+  let DecoderNamespace = "GFX940";
   let Inst{13} = ps.sve;
   let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
 }
 
 multiclass FLAT_Real_AllAddr_SVE_vi<bits<7> op> {
-  def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)> {
-    let AssemblerPredicate = isGFX8GFX9NotGFX940;
-    let OtherPredicates = [isGFX8GFX9NotGFX940];
-  }
-  def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")> {
-    let DecoderNamespace = "GFX9";
+  let OtherPredicates = [isGFX8GFX9NotGFX940] in {
+    defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME)>;
   }
+
+  defm _SADDR_vi : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+
   let AssemblerPredicate = isGFX940Plus in {
     def _VE_gfx940  : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
     def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>;
@@ -2728,11 +2751,11 @@ multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
 
   let OtherPredicates = [isGFX8GFX9NotGFX940] in {
-    def _vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb> {
-      let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds";
+    let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds" in {
+      defm "" : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
     }
-    def _SADDR_vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb> {
-      let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds";
+    let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds" in {
+      defm _SADDR : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
     }
   }
 
@@ -2748,47 +2771,66 @@ multiclass FLAT_Real_AllAddr_SVE_LDS<bits<7> op, bits<7> pre_gfx940_op> {
   def _ST_gfx940  : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
 }
 
-def FLAT_LOAD_UBYTE_vi         : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
-def FLAT_LOAD_SBYTE_vi         : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
-def FLAT_LOAD_USHORT_vi        : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
-def FLAT_LOAD_SSHORT_vi        : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>;
-def FLAT_LOAD_DWORD_vi         : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>;
-def FLAT_LOAD_DWORDX2_vi       : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>;
-def FLAT_LOAD_DWORDX4_vi       : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
-def FLAT_LOAD_DWORDX3_vi       : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
-
-def FLAT_STORE_BYTE_vi         : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
-def FLAT_STORE_BYTE_D16_HI_vi  : FLAT_Real_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
-def FLAT_STORE_SHORT_vi        : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
-def FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
-def FLAT_STORE_DWORD_vi        : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
-def FLAT_STORE_DWORDX2_vi      : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
-def FLAT_STORE_DWORDX4_vi      : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
-def FLAT_STORE_DWORDX3_vi      : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
-
-def FLAT_LOAD_UBYTE_D16_vi    : FLAT_Real_vi <0x20, FLAT_LOAD_UBYTE_D16>;
-def FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
-def FLAT_LOAD_SBYTE_D16_vi    : FLAT_Real_vi <0x22, FLAT_LOAD_SBYTE_D16>;
-def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
-def FLAT_LOAD_SHORT_D16_vi    : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
-def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
+defm FLAT_LOAD_UBYTE_vi         : FLAT_Real_mc_vi <0x10, FLAT_LOAD_UBYTE>;
+defm FLAT_LOAD_SBYTE_vi         : FLAT_Real_mc_vi <0x11, FLAT_LOAD_SBYTE>;
+defm FLAT_LOAD_USHORT_vi        : FLAT_Real_mc_vi <0x12, FLAT_LOAD_USHORT>;
+defm FLAT_LOAD_SSHORT_vi        : FLAT_Real_mc_vi <0x13, FLAT_LOAD_SSHORT>;
+defm FLAT_LOAD_DWORD_vi         : FLAT_Real_mc_vi <0x14, FLAT_LOAD_DWORD>;
+defm FLAT_LOAD_DWORDX2_vi       : FLAT_Real_mc_vi <0x15, FLAT_LOAD_DWORDX2>;
+defm FLAT_LOAD_DWORDX4_vi       : FLAT_Real_mc_vi <0x17, FLAT_LOAD_DWORDX4>;
+defm FLAT_LOAD_DWORDX3_vi       : FLAT_Real_mc_vi <0x16, FLAT_LOAD_DWORDX3>;
+
+defm FLAT_STORE_BYTE_vi         : FLAT_Real_mc_vi <0x18, FLAT_STORE_BYTE>;
+defm FLAT_STORE_BYTE_D16_HI_vi  : FLAT_Real_mc_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
+defm FLAT_STORE_SHORT_vi        : FLAT_Real_mc_vi <0x1a, FLAT_STORE_SHORT>;
+defm FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
+defm FLAT_STORE_DWORD_vi        : FLAT_Real_mc_vi <0x1c, FLAT_STORE_DWORD>;
+defm FLAT_STORE_DWORDX2_vi      : FLAT_Real_mc_vi <0x1d, FLAT_STORE_DWORDX2>;
+defm FLAT_STORE_DWORDX4_vi      : FLAT_Real_mc_vi <0x1f, FLAT_STORE_DWORDX4>;
+defm FLAT_STORE_DWORDX3_vi      : FLAT_Real_mc_vi <0x1e, FLAT_STORE_DWORDX3>;
+
+defm FLAT_LOAD_UBYTE_D16_vi    : FLAT_Real_mc_vi <0x20, FLAT_LOAD_UBYTE_D16>;
+defm FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
+defm FLAT_LOAD_SBYTE_D16_vi    : FLAT_Real_mc_vi <0x22, FLAT_LOAD_SBYTE_D16>;
+defm FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
+defm FLAT_LOAD_SHORT_D16_vi    : FLAT_Real_mc_vi <0x24, FLAT_LOAD_SHORT_D16>;
+defm FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
 
 multiclass FLAT_Real_Atomics_vi <bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
   defvar ps = !cast<FLAT_Pseudo>(NAME);
-  def _vi     : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
-  def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
-  def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
+  defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+  defm _RTN : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+  def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
+}
+
+multiclass FLAT_Real_Atomics_vi_ex_gfx9 <bits<7> op,
+  bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
+  def _vi     : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+  def _RTN_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+
+  def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
 }
 
 multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
   FLAT_Real_AllAddr_vi<op, has_sccb> {
-  def _RTN_vi  : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
-  def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
+  defm _RTN  : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+  defm _SADDR_RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
+
+  def _RTN_agpr_vi  : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+  def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+}
+
+multiclass FLAT_Global_Real_Atomics_vi_ex_gfx9<bits<7> op,
+  bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
+  FLAT_Real_AllAddr_vi_ex_gfx9<op, has_sccb> {
+  def _RTN_vi  : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+  def _SADDR_RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
 
-  def _RTN_agpr_vi  : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
-  def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+  def _RTN_agpr_vi  : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+  def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
 }
 
 defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40>;
@@ -2950,10 +2992,10 @@ let AssemblerPredicate = isGFX940Plus in {
   defm GLOBAL_ATOMIC_ADD_F64     : FLAT_Global_Real_Atomics_gfx940<0x4f>;
   defm GLOBAL_ATOMIC_MIN_F64     : FLAT_Global_Real_Atomics_gfx940<0x50>;
   defm GLOBAL_ATOMIC_MAX_F64     : FLAT_Global_Real_Atomics_gfx940<0x51>;
-  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi<0x4d>;
-  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi<0x4e>;
-  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi<0x52>;
-  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
+  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi_ex_gfx9<0x4d>;
+  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi_ex_gfx9<0x4e>;
+  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi_ex_gfx9<0x52>;
+  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi_ex_gfx9<0x52>;
 } // End AssemblerPredicate = isGFX940Plus
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 703ec0a4befa5..3e6f35dbf5e54 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -186,8 +186,12 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
     O << " dlc";
   if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
     O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
-  if (Imm & ~CPol::ALL_pregfx12)
-    O << " /* unexpected cache policy bit */";
+  if (Imm & ~CPol::ALL_pregfx12) {
+    if ((Imm & CPol::NV) && AMDGPU::isGFX9(STI) && !AMDGPU::isGFX90A(STI))
+      O << " nv";
+    else
+      O << " /* unexpected cache policy bit */";
+  }
 }
 
 void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
diff --git a/llvm/test/MC/AMDGPU/gfx90a_err.s b/llvm/test/MC/AMDGPU/gfx90a_err.s
index ff0dfb371bbbf..78e4f86ec1b90 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_err.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_err.s
@@ -674,3 +674,46 @@ v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
 v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
 // GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
 
+// nv bit in FLAT instructions
+flat_load_ubyte v5, v[2:3] offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_load_ubyte a5, v[2:3] offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_store_dword v[2:3], v5 offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_store_dword v[2:3], a5 offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_load_ubyte v5, v[2:3], off offset:-1 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_store_byte v[2:3], v5, off offset:-1 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_add v[2:3], v5, off nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_swap a1, v[2:3], a2, off glc nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_swap_x2 v[2:3], v[4:5], off nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_swap_x2 v[2:3], a[4:5], off nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_load_ubyte v5, off, s2 offset:-1 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_load_ubyte a5, off, s2 offset:-1 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_store_dword v2, v3, off nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
diff --git a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
index c96a72ddc2573..3af0d83fb3056 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
@@ -706,107 +706,107 @@ flat_load_short_d16_hi a5, v[2:3] offset:4095 glc
 flat_load_short_d16_hi a5, v[2:3] offset:4095 slc
 
 // GFX90A: flat_atomic_swap a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_swap a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_add a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_add a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_sub a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_sub a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_smin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_smin a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_umin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_umin a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_smax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_smax a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_umax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_umax a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_and a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_and a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_or a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_or a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_xor a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_xor a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_inc a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_inc a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_dec a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_dec a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc
 
 // GFX90A: flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_swap v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx942_err.s b/llvm/test/MC/AMDGPU/gfx942_err.s
index fd59a01b34a04..dc51bab65aa04 100644
--- a/llvm/test/MC/AMDGPU/gfx942_err.s
+++ b/llvm/test/MC/AMDGPU/gfx942_err.s
@@ -125,3 +125,31 @@ global_load_dword v[2:3], off lds
 
 scratch_load_dword v2, off lds
 // GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+// nv bit in FLAT instructions
+flat_load_ubyte v5, v[2:3] offset:4095 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_store_dword v[2:3], v5 offset:4095 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_atomic_add_f32 v[2:3], v5 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_load_dword v2, v[2:3], off sc0 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_store_dword v[2:3], v5 off sc0 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_add_f64 v[0:1], v[2:3], off sc1 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_swap v0, v[2:3], v5 off sc0 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_load_lds_dword v2, off nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_store_dword v2, v3, off nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
index 5cc3d2533a149..7687c0a478bd9 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
@@ -24,6 +24,18 @@ flat_load_ubyte v5, v[1:2] offset:4095 glc
 flat_load_ubyte v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_ubyte v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_sbyte v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05]
 
@@ -48,6 +60,18 @@ flat_load_sbyte v5, v[1:2] offset:4095 glc
 flat_load_sbyte v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_sbyte v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_ushort v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05]
 
@@ -72,6 +96,18 @@ flat_load_ushort v5, v[1:2] offset:4095 glc
 flat_load_ushort v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_ushort v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ushort v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ushort v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ushort v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_sshort v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05]
 
@@ -96,6 +132,18 @@ flat_load_sshort v5, v[1:2] offset:4095 glc
 flat_load_sshort v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_sshort v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sshort v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sshort v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sshort v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_dword v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05]
 
@@ -120,6 +168,18 @@ flat_load_dword v5, v[1:2] offset:4095 glc
 flat_load_dword v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_dword v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dword v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dword v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dword v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_dwordx2 v[5:6], v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05]
 
@@ -144,6 +204,18 @@ flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc
 flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_dwordx2 v[5:6], v[1:2] nv
+// CHECK: [0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx2 v[5:6], v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_dwordx3 v[5:7], v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05]
 
@@ -168,6 +240,18 @@ flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc
 flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_dwordx3 v[5:7], v[1:2] nv
+// CHECK: [0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx3 v[5:7], v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_dwordx4 v[5:8], v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05]
 
@@ -192,6 +276,18 @@ flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc
 flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_dwordx4 v[5:8], v[1:2] nv
+// CHECK: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx4 v[5:8], v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05]
+
 flat_store_byte v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00]
 
@@ -216,6 +312,18 @@ flat_store_byte v[1:2], v2 offset:4095 glc
 flat_store_byte v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_byte v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_byte_d16_hi v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00]
 
@@ -240,6 +348,18 @@ flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc
 flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_byte_d16_hi v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte_d16_hi v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_short v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00]
 
@@ -264,6 +384,18 @@ flat_store_short v[1:2], v2 offset:4095 glc
 flat_store_short v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_short v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_short_d16_hi v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00]
 
@@ -288,6 +420,18 @@ flat_store_short_d16_hi v[1:2], v2 offset:4095 glc
 flat_store_short_d16_hi v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_short_d16_hi v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short_d16_hi v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short_d16_hi v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short_d16_hi v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_dword v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00]
 
@@ -312,6 +456,18 @@ flat_store_dword v[1:2], v2 offset:4095 glc
 flat_store_dword v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_dword v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dword v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dword v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dword v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_dwordx2 v[1:2], v[2:3] offset:4095
 // CHECK: [0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00]
 
@@ -336,6 +492,18 @@ flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc
 flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc
 // CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_dwordx2 v[1:2], v[2:3] nv
+// CHECK: [0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx2 v[1:2], v[2:3] offset:7 nv
+// CHECK: [0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_dwordx3 v[1:2], v[2:4] offset:4095
 // CHECK: [0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00]
 
@@ -360,6 +528,18 @@ flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc
 flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc
 // CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_dwordx3 v[1:2], v[2:4] nv
+// CHECK: [0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx3 v[1:2], v[2:4] offset:7 nv
+// CHECK: [0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_dwordx4 v[1:2], v[2:5] offset:4095
 // CHECK: [0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00]
 
@@ -384,6 +564,18 @@ flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc
 flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc
 // CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_dwordx4 v[1:2], v[2:5] nv
+// CHECK: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx4 v[1:2], v[2:5] offset:7 nv
+// CHECK: [0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00]
+
 flat_load_ubyte_d16 v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05]
 
@@ -408,6 +600,18 @@ flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc
 flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_ubyte_d16 v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16 v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_ubyte_d16_hi v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05]
 
@@ -432,6 +636,18 @@ flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc
 flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_ubyte_d16_hi v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16_hi v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_sbyte_d16 v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05]
 
@@ -456,6 +672,18 @@ flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc
 flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_sbyte_d16 v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16 v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_sbyte_d16_hi v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05]
 
@@ -480,6 +708,18 @@ flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc
 flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_sbyte_d16_hi v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16_hi v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_short_d16 v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05]
 
@@ -504,6 +744,18 @@ flat_load_short_d16 v5, v[1:2] offset:4095 glc
 flat_load_short_d16 v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_short_d16 v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16 v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16 v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16 v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_short_d16_hi v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05]
 
@@ -528,6 +780,18 @@ flat_load_short_d16_hi v5, v[1:2] offset:4095 glc
 flat_load_short_d16_hi v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_short_d16_hi v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16_hi v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16_hi v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16_hi v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05]
+
 flat_atomic_swap v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00]
 
@@ -552,6 +816,18 @@ flat_atomic_swap v0, v[1:2], v2 offset:4095 glc
 flat_atomic_swap v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00]
 
+flat_atomic_swap v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_swap v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_swap v0, v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_swap v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00]
+
 flat_atomic_cmpswap v[1:2], v[2:3] offset:4095
 // CHECK: [0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00]
 
@@ -576,6 +852,18 @@ flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc
 flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc
 // CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00]
 
+flat_atomic_cmpswap v[1:2], v[2:3] nv
+// CHECK: [0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_cmpswap v[1:2], v[2:3] offset:7 nv
+// CHECK: [0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00]
+
 flat_atomic_add v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00]
 
@@ -600,6 +888,18 @@ flat_atomic_add v0, v[1:2], v2 offset:4095 glc
 flat_atomic_add v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00]
 
+flat_atomic_add v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_add v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_add v0, v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_add v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00]
+
 flat_atomic_sub v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00]
 
@@ -1197,6 +1497,18 @@ global_load_ubyte v5, v1, s[4:5] offset:-1 glc
 global_load_ubyte v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_ubyte v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_sbyte v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1242,6 +1554,18 @@ global_load_sbyte v5, v1, s[4:5] offset:-1 glc
 global_load_sbyte v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_sbyte v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_ushort v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1287,6 +1611,18 @@ global_load_ushort v5, v1, s[4:5] offset:-1 glc
 global_load_ushort v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_ushort v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ushort v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ushort v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ushort v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_sshort v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1332,6 +1668,18 @@ global_load_sshort v5, v1, s[4:5] offset:-1 glc
 global_load_sshort v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_sshort v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sshort v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sshort v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sshort v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_dword v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1377,6 +1725,18 @@ global_load_dword v5, v1, s[4:5] offset:-1 glc
 global_load_dword v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_dword v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dword v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dword v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dword v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1422,6 +1782,18 @@ global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc
 global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_dwordx2 v[5:6], v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1467,6 +1839,15 @@ global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc
 global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_dwordx3 v[5:7], v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1512,6 +1893,15 @@ global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc
 global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_dwordx4 v[5:8], v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05]
+
 global_store_byte v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1557,6 +1947,18 @@ global_store_byte v1, v2, s[6:7] offset:-1 glc
 global_store_byte v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_byte v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_byte_d16_hi v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1602,6 +2004,18 @@ global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc
 global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_byte_d16_hi v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_short v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1647,6 +2061,18 @@ global_store_short v1, v2, s[6:7] offset:-1 glc
 global_store_short v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_short v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_short_d16_hi v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1692,6 +2118,18 @@ global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc
 global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_short_d16_hi v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short_d16_hi v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_dword v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1737,6 +2175,18 @@ global_store_dword v1, v2, s[6:7] offset:-1 glc
 global_store_dword v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_dword v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dword v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dword v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dword v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1782,6 +2232,18 @@ global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc
 global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_dwordx2 v1, v[2:3], s[6:7] nv
+// CHECK: [0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1827,6 +2289,18 @@ global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc
 global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_dwordx3 v1, v[2:4], s[6:7] nv
+// CHECK: [0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1872,6 +2346,18 @@ global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc
 global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_dwordx4 v1, v[2:5], s[6:7] nv
+// CHECK: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00]
+
 global_load_ubyte_d16 v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1917,6 +2403,18 @@ global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc
 global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_ubyte_d16 v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1962,6 +2460,18 @@ global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc
 global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_ubyte_d16_hi v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_sbyte_d16 v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2007,6 +2517,18 @@ global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc
 global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_sbyte_d16 v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2052,6 +2574,18 @@ global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc
 global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_sbyte_d16_hi v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_short_d16 v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2097,6 +2631,18 @@ global_load_short_d16 v5, v1, s[4:5] offset:-1 glc
 global_load_short_d16 v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_short_d16 v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16 v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16 v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16 v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_short_d16_hi v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2142,6 +2688,18 @@ global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc
 global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_short_d16_hi v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16_hi v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05]
+
 global_atomic_swap v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x06,0x00]
 
@@ -2187,6 +2745,18 @@ global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc
 global_atomic_swap v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x06,0x00]
 
+global_atomic_swap v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_swap v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_swap v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00]
+
 global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x06,0x00]
 
@@ -2232,6 +2802,18 @@ global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc
 global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x06,0x00]
 
+global_atomic_cmpswap v1, v[2:3], s[6:7] nv
+// CHECK: [0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00]
+
 global_atomic_add v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x06,0x00]
 
@@ -2277,6 +2859,18 @@ global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc
 global_atomic_add v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x06,0x00]
 
+global_atomic_add v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_add v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_add v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00]
+
 global_atomic_sub v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x0c,0xdd,0x01,0x02,0x06,0x00]
 
@@ -3357,6 +3951,18 @@ scratch_load_ubyte v5, off, s2 offset:-1 glc
 scratch_load_ubyte v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_ubyte v5, off, s2 nv
+// CHECK: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_sbyte v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3402,6 +4008,18 @@ scratch_load_sbyte v5, off, s2 offset:-1 glc
 scratch_load_sbyte v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_sbyte v5, off, s2 nv
+// CHECK: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_ushort v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3447,6 +4065,18 @@ scratch_load_ushort v5, off, s2 offset:-1 glc
 scratch_load_ushort v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_ushort v5, off, s2 nv
+// CHECK: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ushort v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ushort v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ushort v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_sshort v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3492,6 +4122,18 @@ scratch_load_sshort v5, off, s2 offset:-1 glc
 scratch_load_sshort v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_sshort v5, off, s2 nv
+// CHECK: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sshort v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sshort v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sshort v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_dword v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3537,6 +4179,18 @@ scratch_load_dword v5, off, s2 offset:-1 glc
 scratch_load_dword v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_dword v5, off, s2 nv
+// CHECK: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dword v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dword v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dword v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_dwordx2 v[5:6], off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3582,6 +4236,18 @@ scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc
 scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_dwordx2 v[5:6], off, s2 nv
+// CHECK: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx2 v[5:6], off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_dwordx3 v[5:7], off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3627,6 +4293,18 @@ scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc
 scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_dwordx3 v[5:7], off, s2 nv
+// CHECK: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx3 v[5:7], off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_dwordx4 v[5:8], off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3672,6 +4350,18 @@ scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc
 scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_dwordx4 v[5:8], off, s2 nv
+// CHECK: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx4 v[5:8], off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_store_byte off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3717,6 +4407,18 @@ scratch_store_byte off, v2, s3 offset:-1 glc
 scratch_store_byte off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_byte off, v2, s3 nv
+// CHECK: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_byte_d16_hi off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3762,6 +4464,18 @@ scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc
 scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_byte_d16_hi off, v2, s3 nv
+// CHECK: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte_d16_hi off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_short off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3807,6 +4521,18 @@ scratch_store_short off, v2, s3 offset:-1 glc
 scratch_store_short off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_short off, v2, s3 nv
+// CHECK: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_short_d16_hi off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3852,6 +4578,18 @@ scratch_store_short_d16_hi off, v2, s3 offset:-1 glc
 scratch_store_short_d16_hi off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_short_d16_hi off, v2, s3 nv
+// CHECK: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short_d16_hi off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short_d16_hi off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short_d16_hi off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_dword off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3897,6 +4635,18 @@ scratch_store_dword off, v2, s3 offset:-1 glc
 scratch_store_dword off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_dword off, v2, s3 nv
+// CHECK: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dword off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dword off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dword off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_dwordx2 off, v[2:3], s3 offset:-1
 // CHECK: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3942,6 +4692,18 @@ scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc
 scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_dwordx2 off, v[2:3], s3 nv
+// CHECK: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx2 off, v[2:3], s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_dwordx3 off, v[2:4], s3 offset:-1
 // CHECK: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3987,6 +4749,18 @@ scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc
 scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_dwordx3 off, v[2:4], s3 nv
+// CHECK: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx3 off, v[2:4], s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_dwordx4 off, v[2:5], s3 offset:-1
 // CHECK: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4032,6 +4806,18 @@ scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc
 scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_dwordx4 off, v[2:5], s3 nv
+// CHECK: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx4 off, v[2:5], s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_load_ubyte_d16 v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4077,6 +4863,18 @@ scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc
 scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_ubyte_d16 v5, off, s2 nv
+// CHECK: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16 v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_ubyte_d16_hi v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4122,6 +4920,18 @@ scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc
 scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_ubyte_d16_hi v5, off, s2 nv
+// CHECK: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_sbyte_d16 v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4167,6 +4977,18 @@ scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc
 scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_sbyte_d16 v5, off, s2 nv
+// CHECK: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16 v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_sbyte_d16_hi v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4212,6 +5034,18 @@ scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc
 scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_sbyte_d16_hi v5, off, s2 nv
+// CHECK: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_short_d16 v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4254,6 +5088,18 @@ scratch_load_short_d16 v5, off, s2 offset:-4096
 scratch_load_short_d16 v5, off, s2 offset:-1 glc
 // CHECK: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_short_d16 v5, off, s2 nv
+// CHECK: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16 v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16 v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16 v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_short_d16 v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4302,6 +5148,18 @@ scratch_load_short_d16_hi v5, off, s2 offset:-1 glc
 scratch_load_short_d16_hi v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_short_d16_hi v5, off, s2 nv
+// CHECK: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16_hi v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16_hi v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16_hi v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
+
 global_load_dword v[2:3], off lds
 // CHECK: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00]
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
index 0ee659e207c91..4c06585a4c2eb 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
@@ -21,6 +21,18 @@
 # CHECK: flat_load_ubyte v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_ubyte v5, v[1:2] nv           ; encoding: [0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte v5, v[1:2] offset:7 nv  ; encoding: [0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_sbyte v5, v[1:2] offset:4095  ; encoding: [0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05
 
@@ -42,6 +54,18 @@
 # CHECK: flat_load_sbyte v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_sbyte v5, v[1:2] nv           ; encoding: [0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte v5, v[1:2] offset:7 nv  ; encoding: [0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_ushort v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05
 
@@ -63,6 +87,18 @@
 # CHECK: flat_load_ushort v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_ushort v5, v[1:2] nv          ; encoding: [0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ushort v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ushort v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ushort v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_sshort v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05
 
@@ -84,6 +120,18 @@
 # CHECK: flat_load_sshort v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_sshort v5, v[1:2] nv          ; encoding: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sshort v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sshort v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sshort v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_dword v5, v[1:2] offset:4095  ; encoding: [0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05
 
@@ -105,6 +153,18 @@
 # CHECK: flat_load_dword v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_dword v5, v[1:2] nv           ; encoding: [0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dword v5, v[1:2] offset:7 nv  ; encoding: [0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dword v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dword v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05
 
@@ -126,6 +186,18 @@
 # CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_dwordx2 v[5:6], v[1:2] nv     ; encoding: [0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05
 
@@ -147,6 +219,18 @@
 # CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_dwordx3 v[5:7], v[1:2] nv     ; encoding: [0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05
 
@@ -168,6 +252,18 @@
 # CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_dwordx4 v[5:8], v[1:2] nv     ; encoding: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_store_byte v[1:2], v2 offset:4095  ; encoding: [0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00
 
@@ -189,6 +285,18 @@
 # CHECK: flat_store_byte v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_byte v[1:2], v2 nv           ; encoding: [0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte v[1:2], v2 offset:7 nv  ; encoding: [0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00
 
@@ -210,6 +318,18 @@
 # CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_byte_d16_hi v[1:2], v2 nv    ; encoding: [0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_short v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00
 
@@ -231,6 +351,18 @@
 # CHECK: flat_store_short v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_short v[1:2], v2 nv          ; encoding: [0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00
 
@@ -252,6 +384,18 @@
 # CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_short_d16_hi v[1:2], v2 nv   ; encoding: [0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_dword v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00
 
@@ -273,6 +417,18 @@
 # CHECK: flat_store_dword v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_dword v[1:2], v2 nv          ; encoding: [0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dword v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dword v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dword v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00
 
@@ -294,6 +450,18 @@
 # CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_dwordx2 v[1:2], v[2:3] nv    ; encoding: [0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:7 nv ; encoding: [0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc nv ; encoding: [0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc nv ; encoding: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00
 
@@ -315,6 +483,18 @@
 # CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_dwordx3 v[1:2], v[2:4] nv    ; encoding: [0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:7 nv ; encoding: [0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc nv ; encoding: [0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc nv ; encoding: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00
 
@@ -336,6 +516,18 @@
 # CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_dwordx4 v[1:2], v[2:5] nv    ; encoding: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:7 nv ; encoding: [0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc nv ; encoding: [0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc nv ; encoding: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05
 
@@ -357,6 +549,18 @@
 # CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_ubyte_d16 v5, v[1:2] nv       ; encoding: [0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05
 
@@ -378,6 +582,18 @@
 # CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] nv    ; encoding: [0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05
 
@@ -399,6 +615,18 @@
 # CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_sbyte_d16 v5, v[1:2] nv       ; encoding: [0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05
 
@@ -420,6 +648,18 @@
 # CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] nv    ; encoding: [0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05
 
@@ -441,6 +681,18 @@
 # CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_short_d16 v5, v[1:2] nv       ; encoding: [0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05
 
@@ -462,6 +714,18 @@
 # CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_short_d16_hi v5, v[1:2] nv    ; encoding: [0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_atomic_swap v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00
 
@@ -483,6 +747,18 @@
 # CHECK: flat_atomic_swap v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00
 
+# CHECK: flat_atomic_swap v[1:2], v2 nv          ; encoding: [0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_swap v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_swap v0, v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_swap v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00
+
 # CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00
 
@@ -504,6 +780,18 @@
 # CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00
 
+# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] nv   ; encoding: [0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:7 nv ; encoding: [0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc nv ; encoding: [0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc nv ; encoding: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00
+
 # CHECK: flat_atomic_add v[1:2], v2 offset:4095  ; encoding: [0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00
 
@@ -525,6 +813,18 @@
 # CHECK: flat_atomic_add v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00
 
+# CHECK: flat_atomic_add v[1:2], v2 nv           ; encoding: [0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
+0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_add v[1:2], v2 offset:7 nv  ; encoding: [0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
+0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_add v0, v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_add v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00
+
 # CHECK: flat_atomic_sub v[1:2], v2 offset:4095  ; encoding: [0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00
 
@@ -1017,6 +1317,18 @@
 # CHECK: global_load_ubyte v5, v[1:2], off       ; encoding: [0x00,0x80,0x40,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x40,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_ubyte v5, v1, s[4:5] nv     ; encoding: [0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_sbyte v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x44,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1026,6 +1338,18 @@
 # CHECK: global_load_sbyte v5, v[1:2], off       ; encoding: [0x00,0x80,0x44,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x44,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_sbyte v5, v1, s[4:5] nv     ; encoding: [0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_ushort v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x48,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1035,6 +1359,18 @@
 # CHECK: global_load_ushort v5, v[1:2], off      ; encoding: [0x00,0x80,0x48,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x48,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_ushort v5, v1, s[4:5] nv    ; encoding: [0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_sshort v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x4c,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1044,6 +1380,18 @@
 # CHECK: global_load_sshort v5, v[1:2], off      ; encoding: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x4c,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_sshort v5, v1, s[4:5] nv    ; encoding: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_dword v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x50,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1053,6 +1401,18 @@
 # CHECK: global_load_dword v5, v[1:2], off       ; encoding: [0x00,0x80,0x50,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x50,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_dword v5, v1, s[4:5] nv     ; encoding: [0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_dwordx2 v[5:6], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x54,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1062,6 +1422,18 @@
 # CHECK: global_load_dwordx2 v[5:6], v[1:2], off ; encoding: [0x00,0x80,0x54,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x54,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] nv ; encoding: [0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_dwordx3 v[5:7], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x58,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1071,6 +1443,18 @@
 # CHECK: global_load_dwordx3 v[5:7], v[1:2], off ; encoding: [0x00,0x80,0x58,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x58,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] nv ; encoding: [0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_dwordx4 v[5:8], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x5c,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1080,6 +1464,18 @@
 # CHECK: global_load_dwordx4 v[5:8], v[1:2], off ; encoding: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x5c,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] nv ; encoding: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_store_byte v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x60,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1089,6 +1485,18 @@
 # CHECK: global_store_byte v[1:2], v2, off       ; encoding: [0x00,0x80,0x60,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x60,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_byte v1, v2, s[6:7] nv     ; encoding: [0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_byte_d16_hi v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x64,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1098,6 +1506,18 @@
 # CHECK: global_store_byte_d16_hi v[1:2], v2, off ; encoding: [0x00,0x80,0x64,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x64,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_short v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x68,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1107,6 +1527,18 @@
 # CHECK: global_store_short v[1:2], v2, off      ; encoding: [0x00,0x80,0x68,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x68,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_short v1, v2, s[6:7] nv    ; encoding: [0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_short_d16_hi v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x6c,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1116,6 +1548,18 @@
 # CHECK: global_store_short_d16_hi v[1:2], v2, off ; encoding: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x6c,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_short_d16_hi v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_dword v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x70,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1125,6 +1569,18 @@
 # CHECK: global_store_dword v[1:2], v2, off      ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x70,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_dword v1, v2, s[6:7] nv    ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_dwordx2 v[1:2], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x74,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1134,6 +1590,18 @@
 # CHECK: global_store_dwordx2 v[1:2], v[2:3], off ; encoding: [0x00,0x80,0x74,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x74,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] nv ; encoding: [0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_dwordx3 v[1:2], v[2:4], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x78,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1143,6 +1611,18 @@
 # CHECK: global_store_dwordx3 v[1:2], v[2:4], off ; encoding: [0x00,0x80,0x78,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x78,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] nv ; encoding: [0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_dwordx4 v[1:2], v[2:5], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x7c,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1152,6 +1632,18 @@
 # CHECK: global_store_dwordx4 v[1:2], v[2:5], off ; encoding: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x7c,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] nv ; encoding: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_load_ubyte_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x80,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1161,6 +1653,18 @@
 # CHECK: global_load_ubyte_d16 v5, v[1:2], off   ; encoding: [0x00,0x80,0x80,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x80,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_ubyte_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x84,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1170,6 +1674,18 @@
 # CHECK: global_load_ubyte_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x84,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x84,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_sbyte_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x88,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1179,6 +1695,18 @@
 # CHECK: global_load_sbyte_d16 v5, v[1:2], off   ; encoding: [0x00,0x80,0x88,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x88,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_sbyte_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x8c,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1188,6 +1716,18 @@
 # CHECK: global_load_sbyte_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x8c,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_short_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x90,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1197,6 +1737,18 @@
 # CHECK: global_load_short_d16 v5, v[1:2], off   ; encoding: [0x00,0x80,0x90,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x90,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_short_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_short_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x94,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1206,6 +1758,18 @@
 # CHECK: global_load_short_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x94,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x94,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_short_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_atomic_swap v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x00,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1215,6 +1779,18 @@
 # CHECK: global_atomic_swap v[1:2], v2, off      ; encoding: [0x00,0x80,0x00,0xdd,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x00,0xdd,0x01,0x02,0x7f,0x00
 
+# CHECK: global_atomic_swap v1, v2, s[6:7] nv    ; encoding: [0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00]
+0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_swap v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_swap v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00
+
 # CHECK: global_atomic_cmpswap v[1:2], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x04,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1236,6 +1812,18 @@
 # CHECK: global_atomic_cmpswap v1, v[2:3], v[4:5], off glc ; encoding: [0x00,0x80,0x05,0xdd,0x02,0x04,0x7f,0x01]
 0x00,0x80,0x05,0xdd,0x02,0x04,0x7f,0x01
 
+# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] nv ; encoding: [0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00]
+0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00
+
 # CHECK: global_atomic_add v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x08,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1245,6 +1833,18 @@
 # CHECK: global_atomic_add v[1:2], v2, off       ; encoding: [0x00,0x80,0x08,0xdd,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x08,0xdd,0x01,0x02,0x7f,0x00
 
+# CHECK: global_atomic_add v1, v2, s[6:7] nv     ; encoding: [0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00]
+0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_add v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_add v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00
+
 # CHECK: global_atomic_sub v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x0c,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x0c,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1503,6 +2103,18 @@
 # CHECK: scratch_load_ubyte v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_ubyte v5, off, s2 nv       ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_sbyte v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05
 
@@ -1542,6 +2154,18 @@
 # CHECK: scratch_load_sbyte v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_sbyte v5, off, s2 nv       ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_ushort v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05
 
@@ -1581,6 +2205,18 @@
 # CHECK: scratch_load_ushort v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_ushort v5, off, s2 nv      ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ushort v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ushort v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ushort v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_sshort v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05
 
@@ -1620,6 +2256,18 @@
 # CHECK: scratch_load_sshort v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_sshort v5, off, s2 nv      ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sshort v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sshort v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sshort v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_dword v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05
 
@@ -1659,6 +2307,18 @@
 # CHECK: scratch_load_dword v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_dword v5, off, s2 nv       ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dword v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dword v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dword v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05
 
@@ -1698,6 +2358,18 @@
 # CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_dwordx2 v[5:6], off, s2 nv ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05
 
@@ -1737,6 +2409,18 @@
 # CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_dwordx3 v[5:7], off, s2 nv ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05
 
@@ -1776,6 +2460,18 @@
 # CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_dwordx4 v[5:8], off, s2 nv ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_store_byte off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00
 
@@ -1815,6 +2511,18 @@
 # CHECK: scratch_store_byte off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_byte off, v2, s3 nv       ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00
 
@@ -1854,6 +2562,18 @@
 # CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_byte_d16_hi off, v2, s3 nv ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_short off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00
 
@@ -1893,6 +2613,18 @@
 # CHECK: scratch_store_short off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_short off, v2, s3 nv      ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00
 
@@ -1932,6 +2664,18 @@
 # CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_short_d16_hi off, v2, s3 nv ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_dword off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00
 
@@ -1971,6 +2715,18 @@
 # CHECK: scratch_store_dword off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_dword off, v2, s3 nv      ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dword off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dword off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dword off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00
 
@@ -2010,6 +2766,18 @@
 # CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_dwordx2 off, v[2:3], s3 nv ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00
 
@@ -2049,6 +2817,18 @@
 # CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_dwordx3 off, v[2:4], s3 nv ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00
 
@@ -2088,6 +2868,18 @@
 # CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_dwordx4 off, v[2:5], s3 nv ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05
 
@@ -2127,6 +2919,18 @@
 # CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_ubyte_d16 v5, off, s2 nv   ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05
 
@@ -2166,6 +2970,18 @@
 # CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05
 
@@ -2205,6 +3021,18 @@
 # CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_sbyte_d16 v5, off, s2 nv   ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05
 
@@ -2244,6 +3072,18 @@
 # CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05
 
@@ -2283,6 +3123,18 @@
 # CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_short_d16 v5, off, s2 nv   ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x94,0xdc,0x00,0x00,0x02,0x05
 
@@ -2322,6 +3174,18 @@
 # CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_short_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: global_load_dword v[2:3], off lds       ; encoding: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00]
 0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00
 

From 6c4f9688082361a5c5d57aa1e6d368dfc4aeea75 Mon Sep 17 00:00:00 2001
From: Hannes Braun <hannes@hannesbraun.net>
Date: Wed, 5 Nov 2025 20:55:32 +0100
Subject: [PATCH 53/61] [clang-format] Fix brace wrapping for Java records
 (#164711)

The brace wrapping for Java records should now behave similar to
classes. Before, opening braces for Java records were always placed in
the same line as the record definition.
---
 clang/lib/Format/UnwrappedLineFormatter.cpp |  8 +++++---
 clang/lib/Format/UnwrappedLineParser.cpp    | 12 ++++++++----
 clang/unittests/Format/FormatTestJava.cpp   | 13 +++++++++++++
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
index ac9c81d4416c9..d31d656a63fc5 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -285,7 +285,8 @@ class LineJoiner {
       if (Tok && Tok->is(tok::kw_typedef))
         Tok = Tok->getNextNonComment();
       if (Tok && Tok->isOneOf(tok::kw_class, tok::kw_struct, tok::kw_union,
-                              tok::kw_extern, Keywords.kw_interface)) {
+                              tok::kw_extern, Keywords.kw_interface,
+                              Keywords.kw_record)) {
         return !Style.BraceWrapping.SplitEmptyRecord && EmptyBlock
                    ? tryMergeSimpleBlock(I, E, Limit)
                    : 0;
@@ -498,7 +499,8 @@ class LineJoiner {
         ShouldMerge = Style.AllowShortEnumsOnASingleLine;
       } else if (TheLine->Last->is(TT_CompoundRequirementLBrace)) {
         ShouldMerge = Style.AllowShortCompoundRequirementOnASingleLine;
-      } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace)) {
+      } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace,
+                                        TT_RecordLBrace)) {
         // NOTE: We use AfterClass (whereas AfterStruct exists) for both classes
         // and structs, but it seems that wrapping is still handled correctly
         // elsewhere.
@@ -507,7 +509,7 @@ class LineJoiner {
                        !Style.BraceWrapping.SplitEmptyRecord);
       } else if (TheLine->InPPDirective ||
                  TheLine->First->isNoneOf(tok::kw_class, tok::kw_enum,
-                                          tok::kw_struct)) {
+                                          tok::kw_struct, Keywords.kw_record)) {
         // Try to merge a block with left brace unwrapped that wasn't yet
         // covered.
         ShouldMerge = !Style.BraceWrapping.AfterFunction ||
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 5e2584edac8f4..8b7dd02d548af 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -948,7 +948,11 @@ static bool isIIFE(const UnwrappedLine &Line,
 }
 
 static bool ShouldBreakBeforeBrace(const FormatStyle &Style,
-                                   const FormatToken &InitialToken) {
+                                   const FormatToken &InitialToken,
+                                   const bool IsJavaRecord) {
+  if (IsJavaRecord)
+    return Style.BraceWrapping.AfterClass;
+
   tok::TokenKind Kind = InitialToken.Tok.getKind();
   if (InitialToken.is(TT_NamespaceMacro))
     Kind = tok::kw_namespace;
@@ -3200,7 +3204,7 @@ void UnwrappedLineParser::parseNamespace() {
   if (FormatTok->is(tok::l_brace)) {
     FormatTok->setFinalizedType(TT_NamespaceLBrace);
 
-    if (ShouldBreakBeforeBrace(Style, InitialToken))
+    if (ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false))
       addUnwrappedLine();
 
     unsigned AddLevels =
@@ -3865,7 +3869,7 @@ bool UnwrappedLineParser::parseEnum() {
   }
 
   if (!Style.AllowShortEnumsOnASingleLine &&
-      ShouldBreakBeforeBrace(Style, InitialToken)) {
+      ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false)) {
     addUnwrappedLine();
   }
   // Parse enum body.
@@ -4160,7 +4164,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr, bool IsJavaRecord) {
     if (ParseAsExpr) {
       parseChildBlock();
     } else {
-      if (ShouldBreakBeforeBrace(Style, InitialToken))
+      if (ShouldBreakBeforeBrace(Style, InitialToken, IsJavaRecord))
         addUnwrappedLine();
 
       unsigned AddLevels = Style.IndentAccessModifiers ? 2u : 1u;
diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp
index 1416614bae29a..3cc97e2dc0b2e 100644
--- a/clang/unittests/Format/FormatTestJava.cpp
+++ b/clang/unittests/Format/FormatTestJava.cpp
@@ -848,6 +848,19 @@ TEST_F(FormatTestJava, TextBlock) {
                  "              Pat Q. Smith");
 }
 
+TEST_F(FormatTestJava, BreakAfterRecord) {
+  auto Style = getLLVMStyle(FormatStyle::LK_Java);
+  Style.EmptyLineBeforeAccessModifier = FormatStyle::ELBAMS_Never;
+  Style.BreakBeforeBraces = FormatStyle::BS_Custom;
+  Style.BraceWrapping.AfterClass = true;
+  Style.BraceWrapping.SplitEmptyRecord = true;
+
+  verifyFormat("public record Foo(int i)\n"
+               "{\n"
+               "}",
+               "public record Foo(int i) {}", Style);
+}
+
 } // namespace
 } // namespace test
 } // namespace format

From 0469ff0a212d7f3dea464c52e19d56e22b5af858 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 5 Nov 2025 11:55:59 -0800
Subject: [PATCH 54/61] TableGen: Split RuntimeLibcallsEmitter into separate
 utility header (#166583)

This information will be needed in more emitters, so start factoring
it to be more reusable.
---
 llvm/utils/TableGen/Basic/CMakeLists.txt      |   1 +
 llvm/utils/TableGen/Basic/RuntimeLibcalls.cpp |  93 ++++++
 llvm/utils/TableGen/Basic/RuntimeLibcalls.h   | 189 ++++++++++++
 .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 275 ++----------------
 4 files changed, 308 insertions(+), 250 deletions(-)
 create mode 100644 llvm/utils/TableGen/Basic/RuntimeLibcalls.cpp
 create mode 100644 llvm/utils/TableGen/Basic/RuntimeLibcalls.h

diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt
index b4a66ecce6440..2030e9add7f30 100644
--- a/llvm/utils/TableGen/Basic/CMakeLists.txt
+++ b/llvm/utils/TableGen/Basic/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLV
   IntrinsicEmitter.cpp
   RISCVTargetDefEmitter.cpp
   RuntimeLibcallsEmitter.cpp
+  RuntimeLibcalls.cpp
   SDNodeProperties.cpp
   TableGen.cpp
   TargetFeaturesEmitter.cpp
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcalls.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcalls.cpp
new file mode 100644
index 0000000000000..1e609a2a8880b
--- /dev/null
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcalls.cpp
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RuntimeLibcalls.h"
+#include "llvm/TableGen/Error.h"
+
+using namespace llvm;
+
+RuntimeLibcalls::RuntimeLibcalls(const RecordKeeper &Records) {
+  ArrayRef<const Record *> AllRuntimeLibcalls =
+      Records.getAllDerivedDefinitions("RuntimeLibcall");
+
+  RuntimeLibcallDefList.reserve(AllRuntimeLibcalls.size());
+
+  size_t CallTypeEnumVal = 0;
+  for (const Record *RuntimeLibcallDef : AllRuntimeLibcalls) {
+    RuntimeLibcallDefList.emplace_back(RuntimeLibcallDef, CallTypeEnumVal++);
+    Def2RuntimeLibcall[RuntimeLibcallDef] = &RuntimeLibcallDefList.back();
+  }
+
+  for (RuntimeLibcall &LibCall : RuntimeLibcallDefList)
+    Def2RuntimeLibcall[LibCall.getDef()] = &LibCall;
+
+  ArrayRef<const Record *> AllRuntimeLibcallImplsRaw =
+      Records.getAllDerivedDefinitions("RuntimeLibcallImpl");
+
+  SmallVector<const Record *, 1024> AllRuntimeLibcallImpls(
+      AllRuntimeLibcallImplsRaw);
+
+  // Sort by libcall impl name and secondarily by the enum name.
+  sort(AllRuntimeLibcallImpls, [](const Record *A, const Record *B) {
+    return std::pair(A->getValueAsString("LibCallFuncName"), A->getName()) <
+           std::pair(B->getValueAsString("LibCallFuncName"), B->getName());
+  });
+
+  RuntimeLibcallImplDefList.reserve(AllRuntimeLibcallImpls.size());
+
+  size_t LibCallImplEnumVal = 1;
+  for (const Record *LibCallImplDef : AllRuntimeLibcallImpls) {
+    RuntimeLibcallImplDefList.emplace_back(LibCallImplDef, Def2RuntimeLibcall,
+                                           LibCallImplEnumVal++);
+
+    const RuntimeLibcallImpl &LibCallImpl = RuntimeLibcallImplDefList.back();
+    Def2RuntimeLibcallImpl[LibCallImplDef] = &LibCallImpl;
+
+    if (LibCallImpl.isDefault()) {
+      const RuntimeLibcall *Provides = LibCallImpl.getProvides();
+      if (!Provides)
+        PrintFatalError(LibCallImplDef->getLoc(),
+                        "default implementations must provide a libcall");
+      LibCallToDefaultImpl[Provides] = &LibCallImpl;
+    }
+  }
+}
+
+void LibcallPredicateExpander::expand(SetTheory &ST, const Record *Def,
+                                      SetTheory::RecSet &Elts) {
+  assert(Def->isSubClassOf("LibcallImpls"));
+
+  SetTheory::RecSet TmpElts;
+
+  ST.evaluate(Def->getValueInit("MemberList"), TmpElts, Def->getLoc());
+
+  Elts.insert(TmpElts.begin(), TmpElts.end());
+
+  AvailabilityPredicate AP(Def->getValueAsDef("AvailabilityPredicate"));
+  const Record *CCClass = Def->getValueAsOptionalDef("CallingConv");
+
+  // This is assuming we aren't conditionally applying a calling convention to
+  // some subsets, and not another, but this doesn't appear to be used.
+
+  for (const Record *LibcallImplDef : TmpElts) {
+    const RuntimeLibcallImpl *LibcallImpl =
+        Libcalls.getRuntimeLibcallImpl(LibcallImplDef);
+    if (!AP.isAlwaysAvailable() || CCClass) {
+      auto [It, Inserted] = Func2Preds.insert({LibcallImpl, {{}, CCClass}});
+      if (!Inserted) {
+        PrintError(
+            Def,
+            "combining nested libcall set predicates currently unhandled: '" +
+                LibcallImpl->getLibcallFuncName() + "'");
+      }
+
+      It->second.first.push_back(AP.getDef());
+      It->second.second = CCClass;
+    }
+  }
+}
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcalls.h b/llvm/utils/TableGen/Basic/RuntimeLibcalls.h
new file mode 100644
index 0000000000000..6c9897602b2fa
--- /dev/null
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcalls.h
@@ -0,0 +1,189 @@
+//===------------------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_TABLEGEN_COMMON_RUNTIMELIBCALLS_H
+#define LLVM_UTILS_TABLEGEN_COMMON_RUNTIMELIBCALLS_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/SetTheory.h"
+
+namespace llvm {
+
+class AvailabilityPredicate {
+  const Record *TheDef;
+  StringRef PredicateString;
+
+public:
+  AvailabilityPredicate(const Record *Def) : TheDef(Def) {
+    if (TheDef)
+      PredicateString = TheDef->getValueAsString("Cond");
+  }
+
+  const Record *getDef() const { return TheDef; }
+
+  bool isAlwaysAvailable() const { return PredicateString.empty(); }
+
+  void emitIf(raw_ostream &OS) const {
+    OS << "if (" << PredicateString << ") {\n";
+  }
+
+  void emitEndIf(raw_ostream &OS) const { OS << "}\n"; }
+
+  void emitTableVariableNameSuffix(raw_ostream &OS) const {
+    if (TheDef)
+      OS << '_' << TheDef->getName();
+  }
+};
+
+class RuntimeLibcalls;
+class RuntimeLibcallImpl;
+
+/// Used to apply predicates to nested sets of libcalls.
+struct LibcallPredicateExpander : SetTheory::Expander {
+  const RuntimeLibcalls &Libcalls;
+  DenseMap<const RuntimeLibcallImpl *,
+           std::pair<std::vector<const Record *>, const Record *>> &Func2Preds;
+
+  LibcallPredicateExpander(
+      const RuntimeLibcalls &Libcalls,
+      DenseMap<const RuntimeLibcallImpl *,
+               std::pair<std::vector<const Record *>, const Record *>>
+          &Func2Preds)
+      : Libcalls(Libcalls), Func2Preds(Func2Preds) {}
+
+  void expand(SetTheory &ST, const Record *Def,
+              SetTheory::RecSet &Elts) override;
+};
+
+class RuntimeLibcall {
+  const Record *TheDef = nullptr;
+  const size_t EnumVal;
+
+public:
+  RuntimeLibcall() = delete;
+  RuntimeLibcall(const Record *Def, size_t EnumVal)
+      : TheDef(Def), EnumVal(EnumVal) {
+    assert(Def);
+  }
+
+  ~RuntimeLibcall() { assert(TheDef); }
+
+  const Record *getDef() const { return TheDef; }
+
+  StringRef getName() const { return TheDef->getName(); }
+
+  size_t getEnumVal() const { return EnumVal; }
+
+  void emitEnumEntry(raw_ostream &OS) const {
+    OS << "RTLIB::" << TheDef->getValueAsString("Name");
+  }
+};
+
+class RuntimeLibcallImpl {
+  const Record *TheDef;
+  const RuntimeLibcall *Provides = nullptr;
+  const size_t EnumVal;
+
+public:
+  RuntimeLibcallImpl(
+      const Record *Def,
+      const DenseMap<const Record *, const RuntimeLibcall *> &ProvideMap,
+      size_t EnumVal)
+      : TheDef(Def), EnumVal(EnumVal) {
+    if (const Record *ProvidesDef = Def->getValueAsDef("Provides"))
+      Provides = ProvideMap.lookup(ProvidesDef);
+  }
+
+  ~RuntimeLibcallImpl() = default;
+
+  const Record *getDef() const { return TheDef; }
+
+  StringRef getName() const { return TheDef->getName(); }
+
+  size_t getEnumVal() const { return EnumVal; }
+
+  const RuntimeLibcall *getProvides() const { return Provides; }
+
+  StringRef getLibcallFuncName() const {
+    return TheDef->getValueAsString("LibCallFuncName");
+  }
+
+  const Record *getCallingConv() const {
+    return TheDef->getValueAsOptionalDef("CallingConv");
+  }
+
+  void emitQuotedLibcallFuncName(raw_ostream &OS) const {
+    OS << '\"' << getLibcallFuncName() << '\"';
+  }
+
+  bool isDefault() const { return TheDef->getValueAsBit("IsDefault"); }
+
+  void emitEnumEntry(raw_ostream &OS) const {
+    OS << "RTLIB::impl_" << this->getName();
+  }
+
+  void emitSetImplCall(raw_ostream &OS) const {
+    OS << "setLibcallImpl(";
+    Provides->emitEnumEntry(OS);
+    OS << ", ";
+    emitEnumEntry(OS);
+    OS << "); // " << getLibcallFuncName() << '\n';
+  }
+
+  void emitTableEntry(raw_ostream &OS) const {
+    OS << '{';
+    Provides->emitEnumEntry(OS);
+    OS << ", ";
+    emitEnumEntry(OS);
+    OS << "}, // " << getLibcallFuncName() << '\n';
+  }
+
+  void emitSetCallingConv(raw_ostream &OS) const {}
+};
+
+struct LibcallsWithCC {
+  std::vector<const RuntimeLibcallImpl *> LibcallImpls;
+  const Record *CallingConv = nullptr;
+};
+
+class RuntimeLibcalls {
+private:
+  DenseMap<const Record *, const RuntimeLibcall *> Def2RuntimeLibcall;
+  DenseMap<const Record *, const RuntimeLibcallImpl *> Def2RuntimeLibcallImpl;
+
+  std::vector<RuntimeLibcall> RuntimeLibcallDefList;
+  std::vector<RuntimeLibcallImpl> RuntimeLibcallImplDefList;
+
+  DenseMap<const RuntimeLibcall *, const RuntimeLibcallImpl *>
+      LibCallToDefaultImpl;
+
+public:
+  RuntimeLibcalls(const RecordKeeper &Records);
+
+  ArrayRef<RuntimeLibcall> getRuntimeLibcallDefList() const {
+    return RuntimeLibcallDefList;
+  }
+
+  ArrayRef<RuntimeLibcallImpl> getRuntimeLibcallImplDefList() const {
+    return RuntimeLibcallImplDefList;
+  }
+
+  const RuntimeLibcall *getRuntimeLibcall(const Record *Def) const {
+    return Def2RuntimeLibcall.lookup(Def);
+  }
+
+  const RuntimeLibcallImpl *getRuntimeLibcallImpl(const Record *Def) const {
+    return Def2RuntimeLibcallImpl.lookup(Def);
+  }
+};
+
+} // namespace llvm
+
+#endif // LLVM_UTILS_TABLEGEN_COMMON_RUNTIMELIBCALLS_H
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index 001ca7b658d3c..7aca87a63d0a2 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -8,6 +8,8 @@
 
 #define DEBUG_TYPE "runtime-libcall-emitter"
 
+#include "RuntimeLibcalls.h"
+
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
@@ -65,160 +67,12 @@ template <> struct DenseMapInfo<PredicateWithCC, void> {
     return LHS == RHS;
   }
 };
-} // namespace llvm
-
-namespace {
-
-class AvailabilityPredicate {
-  const Record *TheDef;
-  StringRef PredicateString;
-
-public:
-  AvailabilityPredicate(const Record *Def) : TheDef(Def) {
-    if (TheDef)
-      PredicateString = TheDef->getValueAsString("Cond");
-  }
-
-  const Record *getDef() const { return TheDef; }
-
-  bool isAlwaysAvailable() const { return PredicateString.empty(); }
-
-  void emitIf(raw_ostream &OS) const {
-    OS << "if (" << PredicateString << ") {\n";
-  }
-
-  void emitEndIf(raw_ostream &OS) const { OS << "}\n"; }
-
-  void emitTableVariableNameSuffix(raw_ostream &OS) const {
-    if (TheDef)
-      OS << '_' << TheDef->getName();
-  }
-};
-
-class RuntimeLibcallEmitter;
-class RuntimeLibcallImpl;
-
-/// Used to apply predicates to nested sets of libcalls.
-struct LibcallPredicateExpander : SetTheory::Expander {
-  const RuntimeLibcallEmitter &LibcallEmitter;
-  DenseMap<const RuntimeLibcallImpl *,
-           std::pair<std::vector<const Record *>, const Record *>> &Func2Preds;
-
-  LibcallPredicateExpander(
-      const RuntimeLibcallEmitter &LibcallEmitter,
-      DenseMap<const RuntimeLibcallImpl *,
-               std::pair<std::vector<const Record *>, const Record *>>
-          &Func2Preds)
-      : LibcallEmitter(LibcallEmitter), Func2Preds(Func2Preds) {}
-
-  void expand(SetTheory &ST, const Record *Def,
-              SetTheory::RecSet &Elts) override;
-};
-
-class RuntimeLibcall {
-  const Record *TheDef = nullptr;
-  const size_t EnumVal;
-
-public:
-  RuntimeLibcall() = delete;
-  RuntimeLibcall(const Record *Def, size_t EnumVal)
-      : TheDef(Def), EnumVal(EnumVal) {
-    assert(Def);
-  }
-
-  ~RuntimeLibcall() { assert(TheDef); }
-
-  const Record *getDef() const { return TheDef; }
-
-  StringRef getName() const { return TheDef->getName(); }
-
-  size_t getEnumVal() const { return EnumVal; }
-
-  void emitEnumEntry(raw_ostream &OS) const {
-    OS << "RTLIB::" << TheDef->getValueAsString("Name");
-  }
-};
-
-class RuntimeLibcallImpl {
-  const Record *TheDef;
-  const RuntimeLibcall *Provides = nullptr;
-  const size_t EnumVal;
-
-public:
-  RuntimeLibcallImpl(
-      const Record *Def,
-      const DenseMap<const Record *, const RuntimeLibcall *> &ProvideMap,
-      size_t EnumVal)
-      : TheDef(Def), EnumVal(EnumVal) {
-    if (const Record *ProvidesDef = Def->getValueAsDef("Provides"))
-      Provides = ProvideMap.lookup(ProvidesDef);
-  }
-
-  ~RuntimeLibcallImpl() = default;
-
-  const Record *getDef() const { return TheDef; }
-
-  StringRef getName() const { return TheDef->getName(); }
-
-  size_t getEnumVal() const { return EnumVal; }
-
-  const RuntimeLibcall *getProvides() const { return Provides; }
-
-  StringRef getLibcallFuncName() const {
-    return TheDef->getValueAsString("LibCallFuncName");
-  }
-
-  const Record *getCallingConv() const {
-    return TheDef->getValueAsOptionalDef("CallingConv");
-  }
-
-  void emitQuotedLibcallFuncName(raw_ostream &OS) const {
-    OS << '\"' << getLibcallFuncName() << '\"';
-  }
-
-  bool isDefault() const { return TheDef->getValueAsBit("IsDefault"); }
-
-  void emitEnumEntry(raw_ostream &OS) const {
-    OS << "RTLIB::impl_" << this->getName();
-  }
-
-  void emitSetImplCall(raw_ostream &OS) const {
-    OS << "setLibcallImpl(";
-    Provides->emitEnumEntry(OS);
-    OS << ", ";
-    emitEnumEntry(OS);
-    OS << "); // " << getLibcallFuncName() << '\n';
-  }
-
-  void emitTableEntry(raw_ostream &OS) const {
-    OS << '{';
-    Provides->emitEnumEntry(OS);
-    OS << ", ";
-    emitEnumEntry(OS);
-    OS << "}, // " << getLibcallFuncName() << '\n';
-  }
-
-  void emitSetCallingConv(raw_ostream &OS) const {}
-};
-
-struct LibcallsWithCC {
-  std::vector<const RuntimeLibcallImpl *> LibcallImpls;
-  const Record *CallingConv = nullptr;
-};
 
 class RuntimeLibcallEmitter {
 private:
   const RecordKeeper &Records;
-  DenseMap<const Record *, const RuntimeLibcall *> Def2RuntimeLibcall;
-  DenseMap<const Record *, const RuntimeLibcallImpl *> Def2RuntimeLibcallImpl;
-
-  std::vector<RuntimeLibcall> RuntimeLibcallDefList;
-  std::vector<RuntimeLibcallImpl> RuntimeLibcallImplDefList;
-
-  DenseMap<const RuntimeLibcall *, const RuntimeLibcallImpl *>
-      LibCallToDefaultImpl;
+  RuntimeLibcalls Libcalls;
 
-private:
   void emitGetRuntimeLibcallEnum(raw_ostream &OS) const;
 
   void emitNameMatchHashTable(raw_ostream &OS,
@@ -229,61 +83,7 @@ class RuntimeLibcallEmitter {
   void emitSystemRuntimeLibrarySetCalls(raw_ostream &OS) const;
 
 public:
-  RuntimeLibcallEmitter(const RecordKeeper &R) : Records(R) {
-
-    ArrayRef<const Record *> AllRuntimeLibcalls =
-        Records.getAllDerivedDefinitions("RuntimeLibcall");
-
-    RuntimeLibcallDefList.reserve(AllRuntimeLibcalls.size());
-
-    size_t CallTypeEnumVal = 0;
-    for (const Record *RuntimeLibcallDef : AllRuntimeLibcalls) {
-      RuntimeLibcallDefList.emplace_back(RuntimeLibcallDef, CallTypeEnumVal++);
-      Def2RuntimeLibcall[RuntimeLibcallDef] = &RuntimeLibcallDefList.back();
-    }
-
-    for (RuntimeLibcall &LibCall : RuntimeLibcallDefList)
-      Def2RuntimeLibcall[LibCall.getDef()] = &LibCall;
-
-    ArrayRef<const Record *> AllRuntimeLibcallImplsRaw =
-        Records.getAllDerivedDefinitions("RuntimeLibcallImpl");
-
-    SmallVector<const Record *, 1024> AllRuntimeLibcallImpls(
-        AllRuntimeLibcallImplsRaw);
-
-    // Sort by libcall impl name and secondarily by the enum name.
-    sort(AllRuntimeLibcallImpls, [](const Record *A, const Record *B) {
-      return std::pair(A->getValueAsString("LibCallFuncName"), A->getName()) <
-             std::pair(B->getValueAsString("LibCallFuncName"), B->getName());
-    });
-
-    RuntimeLibcallImplDefList.reserve(AllRuntimeLibcallImpls.size());
-
-    size_t LibCallImplEnumVal = 1;
-    for (const Record *LibCallImplDef : AllRuntimeLibcallImpls) {
-      RuntimeLibcallImplDefList.emplace_back(LibCallImplDef, Def2RuntimeLibcall,
-                                             LibCallImplEnumVal++);
-
-      const RuntimeLibcallImpl &LibCallImpl = RuntimeLibcallImplDefList.back();
-      Def2RuntimeLibcallImpl[LibCallImplDef] = &LibCallImpl;
-
-      if (LibCallImpl.isDefault()) {
-        const RuntimeLibcall *Provides = LibCallImpl.getProvides();
-        if (!Provides)
-          PrintFatalError(LibCallImplDef->getLoc(),
-                          "default implementations must provide a libcall");
-        LibCallToDefaultImpl[Provides] = &LibCallImpl;
-      }
-    }
-  }
-
-  const RuntimeLibcall *getRuntimeLibcall(const Record *Def) const {
-    return Def2RuntimeLibcall.lookup(Def);
-  }
-
-  const RuntimeLibcallImpl *getRuntimeLibcallImpl(const Record *Def) const {
-    return Def2RuntimeLibcallImpl.lookup(Def);
-  }
+  RuntimeLibcallEmitter(const RecordKeeper &R) : Records(R), Libcalls(R) {}
 
   void run(raw_ostream &OS);
 };
@@ -297,24 +97,25 @@ void RuntimeLibcallEmitter::emitGetRuntimeLibcallEnum(raw_ostream &OS) const {
         "namespace RTLIB {\n"
         "enum Libcall : unsigned short {\n";
 
-  for (const RuntimeLibcall &LibCall : RuntimeLibcallDefList) {
+  for (const RuntimeLibcall &LibCall : Libcalls.getRuntimeLibcallDefList()) {
     StringRef Name = LibCall.getName();
     OS << "  " << Name << " = " << LibCall.getEnumVal() << ",\n";
   }
 
-  OS << "  UNKNOWN_LIBCALL = " << RuntimeLibcallDefList.size()
+  OS << "  UNKNOWN_LIBCALL = " << Libcalls.getRuntimeLibcallDefList().size()
      << "\n};\n\n"
         "enum LibcallImpl : unsigned short {\n"
         "  Unsupported = 0,\n";
 
-  for (const RuntimeLibcallImpl &LibCall : RuntimeLibcallImplDefList) {
+  for (const RuntimeLibcallImpl &LibCall :
+       Libcalls.getRuntimeLibcallImplDefList()) {
     OS << "  impl_" << LibCall.getName() << " = " << LibCall.getEnumVal()
        << ", // " << LibCall.getLibcallFuncName() << '\n';
   }
 
   OS << "};\n"
      << "constexpr size_t NumLibcallImpls = "
-     << RuntimeLibcallImplDefList.size() + 1
+     << Libcalls.getRuntimeLibcallImplDefList().size() + 1
      << ";\n"
         "} // End namespace RTLIB\n"
         "} // End namespace llvm\n";
@@ -394,6 +195,8 @@ constructPerfectHashTable(ArrayRef<RuntimeLibcallImpl> Keywords,
 /// Generate hash table based lookup by name.
 void RuntimeLibcallEmitter::emitNameMatchHashTable(
     raw_ostream &OS, StringToOffsetTable &OffsetTable) const {
+  ArrayRef<RuntimeLibcallImpl> RuntimeLibcallImplDefList =
+      Libcalls.getRuntimeLibcallImplDefList();
   std::vector<uint64_t> Hashes(RuntimeLibcallImplDefList.size());
   std::vector<unsigned> TableValues(RuntimeLibcallImplDefList.size());
   DenseSet<StringRef> SeenFuncNames;
@@ -495,7 +298,8 @@ void RuntimeLibcallEmitter::emitGetInitRuntimeLibcallNames(
   {
     IfDefEmitter IfDef(OS, "GET_INIT_RUNTIME_LIBCALL_NAMES");
 
-    for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList)
+    for (const RuntimeLibcallImpl &LibCallImpl :
+         Libcalls.getRuntimeLibcallImplDefList())
       Table.GetOrAddStringOffset(LibCallImpl.getLibcallFuncName());
 
     Table.EmitStringTableDef(OS, "RuntimeLibcallImplNameTable");
@@ -505,7 +309,8 @@ const uint16_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameOffsetTable[] = {
 
     OS << formatv("  {}, // {}\n", Table.GetStringOffset(""),
                   ""); // Unsupported entry
-    for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList) {
+    for (const RuntimeLibcallImpl &LibCallImpl :
+         Libcalls.getRuntimeLibcallImplDefList()) {
       StringRef ImplName = LibCallImpl.getLibcallFuncName();
       OS << formatv("  {}, // {}\n", Table.GetStringOffset(ImplName), ImplName);
     }
@@ -516,7 +321,8 @@ const uint8_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameSizeTable[] = {
 )";
 
     OS << "  0,\n";
-    for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList)
+    for (const RuntimeLibcallImpl &LibCallImpl :
+         Libcalls.getRuntimeLibcallImplDefList())
       OS << "  " << LibCallImpl.getLibcallFuncName().size() << ",\n";
     OS << "};\n\n";
 
@@ -525,7 +331,8 @@ const uint8_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameSizeTable[] = {
           "ImplToLibcall[RTLIB::NumLibcallImpls] = {\n"
           "  RTLIB::UNKNOWN_LIBCALL, // RTLIB::Unsupported\n";
 
-    for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList) {
+    for (const RuntimeLibcallImpl &LibCallImpl :
+         Libcalls.getRuntimeLibcallImplDefList()) {
       const RuntimeLibcall *Provides = LibCallImpl.getProvides();
       OS << "  ";
       Provides->emitEnumEntry(OS);
@@ -533,6 +340,7 @@ const uint8_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameSizeTable[] = {
       LibCallImpl.emitEnumEntry(OS);
       OS << '\n';
     }
+
     OS << "};\n\n";
   }
 
@@ -576,7 +384,7 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
              std::pair<std::vector<const Record *>, const Record *>>
         Func2Preds;
     Sets.addExpander("LibcallImpls", std::make_unique<LibcallPredicateExpander>(
-                                         *this, Func2Preds));
+                                         Libcalls, Func2Preds));
 
     const SetTheory::RecVec *Elements =
         Sets.expand(R->getValueAsDef("MemberList"));
@@ -589,11 +397,12 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
     constexpr unsigned BitsPerStorageElt = 64;
     DenseMap<PredicateWithCC, LibcallsWithCC> Pred2Funcs;
 
-    SmallVector<uint64_t, 32> BitsetValues(
-        divideCeil(RuntimeLibcallImplDefList.size() + 1, BitsPerStorageElt));
+    SmallVector<uint64_t, 32> BitsetValues(divideCeil(
+        Libcalls.getRuntimeLibcallImplDefList().size() + 1, BitsPerStorageElt));
 
     for (const Record *Elt : *Elements) {
-      const RuntimeLibcallImpl *LibCallImpl = getRuntimeLibcallImpl(Elt);
+      const RuntimeLibcallImpl *LibCallImpl =
+          Libcalls.getRuntimeLibcallImpl(Elt);
       if (!LibCallImpl) {
         PrintError(R, "entry for SystemLibrary is not a RuntimeLibcallImpl");
         PrintNote(Elt->getLoc(), "invalid entry `" + Elt->getName() + "`");
@@ -761,39 +570,5 @@ void RuntimeLibcallEmitter::run(raw_ostream &OS) {
   }
 }
 
-void LibcallPredicateExpander::expand(SetTheory &ST, const Record *Def,
-                                      SetTheory::RecSet &Elts) {
-  assert(Def->isSubClassOf("LibcallImpls"));
-
-  SetTheory::RecSet TmpElts;
-
-  ST.evaluate(Def->getValueInit("MemberList"), TmpElts, Def->getLoc());
-
-  Elts.insert(TmpElts.begin(), TmpElts.end());
-
-  AvailabilityPredicate AP(Def->getValueAsDef("AvailabilityPredicate"));
-  const Record *CCClass = Def->getValueAsOptionalDef("CallingConv");
-
-  // This is assuming we aren't conditionally applying a calling convention to
-  // some subsets, and not another, but this doesn't appear to be used.
-
-  for (const Record *LibcallImplDef : TmpElts) {
-    const RuntimeLibcallImpl *LibcallImpl =
-        LibcallEmitter.getRuntimeLibcallImpl(LibcallImplDef);
-    if (!AP.isAlwaysAvailable() || CCClass) {
-      auto [It, Inserted] = Func2Preds.insert({LibcallImpl, {{}, CCClass}});
-      if (!Inserted) {
-        PrintError(
-            Def,
-            "combining nested libcall set predicates currently unhandled: '" +
-                LibcallImpl->getLibcallFuncName() + "'");
-      }
-
-      It->second.first.push_back(AP.getDef());
-      It->second.second = CCClass;
-    }
-  }
-}
-
 static TableGen::Emitter::OptClass<RuntimeLibcallEmitter>
     X("gen-runtime-libcalls", "Generate RuntimeLibcalls");

From b0ae054a568622982e7e623c354709a7463b152a Mon Sep 17 00:00:00 2001
From: YongKang Zhu <yongzhu@fb.com>
Date: Wed, 5 Nov 2025 12:01:58 -0800
Subject: [PATCH 55/61] [BOLT][AArch64] Fix LDR relocation type in ADRP+LDR
 sequence (#166391)

`R_AARCH64_ADD_ABS_LO12_NC` is for the `ADD` instruction in the
`ADRP+ADD` sequence. For `ADRP+LDR` sequence generated in LDR
relaxation, relocation type for `LDR` should be
`R_AARCH64_LDST64_ABS_LO12_NC` if it is 64-bit integer load or
`R_AARCH64_LDST32_ABS_LO12_NC` if 32-bit.

Sorry should have included this in #165787.
---
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 57db6a436c5c6..3c77091d91ebd 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -640,7 +640,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Insts[1].addOperand(MCOperand::createImm(0));
     Insts[1].addOperand(MCOperand::createImm(0));
     setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx,
-                          ELF::R_AARCH64_ADD_ABS_LO12_NC);
+                          isLDRXl(LDRInst) ? ELF::R_AARCH64_LDST64_ABS_LO12_NC
+                                           : ELF::R_AARCH64_LDST32_ABS_LO12_NC);
     return Insts;
   }
 

From f76c132230326a296c4fb8f7cb6c0fb6b943fadb Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 5 Nov 2025 12:02:33 -0800
Subject: [PATCH 56/61] [SimplifyCFG] Fix weight calculation for
 `simplifySwitchOfPowersOfTwo` (#165956)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Continued from #165804

This maintains the BFI of the default branch. Originally `10/63`​, post-pass, it ends up being `5/63 + 58/63 * 5/58`​(first term is from `PROF`​, second is the probability of going to the switch lookup times the probability, there, of taking the default branch)

Issue #147390
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 21 ++++++++++++-------
 .../X86/switch-of-powers-of-two.ll            |  2 +-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 9a8dbebe5bfba..37c048f421f1a 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -7731,19 +7731,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
       // label. The other is those powers of 2 that don't appear in the case
       // statement. We don't know the distribution of the values coming in, so
       // the safest is to split 50-50 the original probability to `default`.
-      uint64_t OrigDenominator = sum_of(map_range(
-          Weights, [](const auto &V) { return static_cast<uint64_t>(V); }));
+      uint64_t OrigDenominator =
+          sum_of(map_range(Weights, StaticCastTo<uint64_t>));
       SmallVector<uint64_t> NewWeights(2);
       NewWeights[1] = Weights[0] / 2;
       NewWeights[0] = OrigDenominator - NewWeights[1];
       setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false);
-
-      // For the original switch, we reduce the weight of the default by the
-      // amount by which the previous branch contributes to getting to default,
-      // and then make sure the remaining weights have the same relative ratio
-      // wrt eachother.
+      // The probability of executing the default block stays constant. It was
+      //  p_d = Weights[0] / OrigDenominator
+      //  we rewrite as W/D
+      // We want to find the probability of the default branch of the switch
+      // statement. Let's call it X. We have W/D = W/2D + X * (1-W/2D)
+      // i.e. the original probability is the probability we go to the default
+      // branch from the BI branch, or we take the default branch on the SI.
+      // Meaning X = W / (2D - W), or (W/2) / (D - W/2)
+      // This matches using W/2 for the default branch probability numerator and
+      // D-W/2 as the denominator.
+      Weights[0] = NewWeights[1];
       uint64_t CasesDenominator = OrigDenominator - Weights[0];
-      Weights[0] /= 2;
       for (auto &W : drop_begin(Weights))
         W = NewWeights[0] * static_cast<double>(W) / CasesDenominator;
 
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
index d818335f075e5..e48c2b46a138a 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
@@ -141,5 +141,5 @@ return:
 ;.
 ; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10}
 ; CHECK: [[PROF1]] = !{!"branch_weights", i32 58, i32 5}
-; CHECK: [[PROF2]] = !{!"branch_weights", i32 56, i32 5}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 53, i32 5}
 ;.

From 78d649199b47370b72848c1ca8d9bd3323b050ac Mon Sep 17 00:00:00 2001
From: Ramkrishnan <ramkrishnan.nk@huawei.com>
Date: Wed, 5 Nov 2025 15:06:30 -0500
Subject: [PATCH 57/61] [InterleavedAccess] Construct interleaved access store
 with shuffles

Cost of interleaved store of 8 factor and 16 factor are cheaper in AArch64 with additional interleave instructions.
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   5 +
 llvm/lib/CodeGen/InterleavedAccessPass.cpp    |  13 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 131 +++++++++++++++++-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   7 +
 .../AArch64/AArch64TargetTransformInfo.cpp    |  38 ++++-
 llvm/test/CodeGen/AArch64/vldn_shuffle.ll     | 105 ++++++++++++++
 .../AArch64/interleaved_store.ll              | 117 ++++++++++++++++
 .../AArch64/replicating-load-store-costs.ll   |   2 +-
 .../PhaseOrdering/AArch64/interleave_vec.ll   |  16 +--
 9 files changed, 416 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2550c2bee5f71..8aeaa9cdacfc1 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3233,6 +3233,11 @@ class LLVM_ABI TargetLoweringBase {
   /// Default to be the minimum interleave factor: 2.
   virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
 
+  /// Return true if the target interleave with shuffles are cheaper
+  virtual bool isProfitableToInterleaveWithGatherScatter() const {
+    return false;
+  }
+
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 5c27a20869f81..45eca28ffb8a2 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
 /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
 /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
 static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
-                               unsigned MaxFactor) {
+                               unsigned MaxFactor,
+                               bool InterleaveWithShuffles) {
   unsigned NumElts = SVI->getShuffleMask().size();
   if (NumElts < 4)
     return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
       return true;
   }
 
+  if (InterleaveWithShuffles) {
+    for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
+      Factor = i * MaxFactor;
+      if (SVI->isInterleave(Factor))
+        return true;
+    }
+  }
   return false;
 }
 
@@ -528,7 +536,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
       cast<FixedVectorType>(SVI->getType())->getNumElements();
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
-  if (!isReInterleaveMask(SVI, Factor, MaxFactor))
+  if (!isReInterleaveMask(SVI, Factor, MaxFactor,
+                          TLI->isProfitableToInterleaveWithGatherScatter()))
     return false;
   assert(NumStoredElements % Factor == 0 &&
          "number of stored element should be a multiple of Factor");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d08f9b94227a2..298746863d221 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -96,6 +96,7 @@
 #include <cctype>
 #include <cstdint>
 #include <cstdlib>
+#include <deque>
 #include <iterator>
 #include <limits>
 #include <optional>
@@ -17989,11 +17990,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
                                                   unsigned Factor,
                                                   const APInt &GapMask) const {
 
-  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
-         "Invalid interleave factor");
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
+
+  if (isProfitableToInterleaveWithGatherScatter() &&
+      Factor > getMaxSupportedInterleaveFactor())
+    return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
+
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+
   assert(!LaneMask && GapMask.popcount() == Factor &&
          "Unexpected mask on store");
 
@@ -18139,6 +18146,126 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
   return true;
 }
 
+/// If the interleaved vector elements are greater than supported MaxFactor,
+/// interleaving the data with additional shuffles can be used to
+/// achieve the same.
+///
+/// Consider the following data with 8 interleaves which are shuffled to store
+/// stN instructions. Data needs to be stored in this order:
+///     [v0, v1, v2, v3, v4, v5, v6, v7]
+///
+///    v0      v4      v2      v6      v1      v5      v3      v7
+///    |       |       |       |       |       |       |       |
+///     \     /         \     /         \     /         \     /
+///   [zip v0,v4]      [zip v2,v6]    [zip v1,v5]      [zip v3,v7] ==> stN = 4
+///        |               |              |                 |
+///         \             /                \               /
+///          \           /                  \             /
+///           \         /                    \           /
+///       [zip [v0,v2,v4,v6]]            [zip [v1,v3,v5,v7]]     ==> stN = 2
+///
+/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored
+/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with
+/// another st4.
+///
+/// For stN = 2, upper half of interleaved data V0, V1 is stored
+/// with one st2 instruction. Second set V2, V3 is stored with another st2.
+/// Total of 4 st2's are required here.
+bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
+    StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
+  unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
+
+  auto *VecTy = cast<FixedVectorType>(SVI->getType());
+  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
+
+  unsigned LaneLen = VecTy->getNumElements() / Factor;
+  Type *EltTy = VecTy->getElementType();
+  auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
+
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  bool UseScalable;
+
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() ||
+      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+    return false;
+
+  if (UseScalable)
+    return false;
+
+  std::deque<Value *> Shuffles;
+  Shuffles.push_back(SVI);
+  unsigned ConcatLevel = Factor;
+  // Getting all the interleaved operands.
+  while (ConcatLevel > 1) {
+    unsigned InterleavedOperands = Shuffles.size();
+    for (unsigned i = 0; i < InterleavedOperands; i++) {
+      ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(Shuffles.front());
+      if (!SFL)
+        return false;
+      Shuffles.pop_front();
+
+      Value *Op0 = SFL->getOperand(0);
+      Value *Op1 = SFL->getOperand(1);
+
+      Shuffles.push_back(dyn_cast<Value>(Op0));
+      Shuffles.push_back(dyn_cast<Value>(Op1));
+    }
+    ConcatLevel >>= 1;
+  }
+
+  IRBuilder<> Builder(SI);
+  auto Mask = createInterleaveMask(LaneLen, 2);
+  SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
+  for (unsigned i = 0; i < LaneLen; i++) {
+    LowerHalfMask[i] = Mask[i];
+    UpperHalfMask[i] = Mask[i + LaneLen];
+  }
+
+  unsigned InterleaveFactor = Factor >> 1;
+  while (InterleaveFactor >= MaxSupportedFactor) {
+    std::deque<Value *> ShufflesIntermediate;
+    ShufflesIntermediate.resize(Factor);
+    for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
+      for (unsigned i = 0; i < InterleaveFactor; i++) {
+        auto *Shuffle = Builder.CreateShuffleVector(
+            Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask);
+        ShufflesIntermediate[i + j] = Shuffle;
+        Shuffle = Builder.CreateShuffleVector(
+            Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask);
+        ShufflesIntermediate[i + j + InterleaveFactor] = Shuffle;
+      }
+    }
+    Shuffles = ShufflesIntermediate;
+    InterleaveFactor >>= 1;
+  }
+
+  Type *PtrTy = SI->getPointerOperandType();
+  auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
+
+  Value *BaseAddr = SI->getPointerOperand();
+  Function *StNFunc = getStructuredStoreFunction(
+      SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
+  for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
+    SmallVector<Value *, 5> Ops;
+    for (unsigned j = 0; j < MaxSupportedFactor; j++)
+      Ops.push_back(Shuffles[i * MaxSupportedFactor + j]);
+
+    if (i > 0) {
+      // We will compute the pointer operand of each store from the original
+      // base address using GEPs. Cast the base address to a pointer to the
+      // scalar  element type.
+      BaseAddr = Builder.CreateConstGEP1_32(
+          SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
+    }
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+    Builder.CreateCall(StNFunc, Ops);
+  }
+  return true;
+}
+
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 70bfae717fb76..bfd8474bfeec9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,6 +229,10 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
 
+  bool isProfitableToInterleaveWithGatherScatter() const override {
+    return true;
+  }
+
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
   bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +243,9 @@ class AArch64TargetLowering : public TargetLowering {
                              ShuffleVectorInst *SVI, unsigned Factor,
                              const APInt &GapMask) const override;
 
+  bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
+                                        unsigned Factor) const;
+
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                         IntrinsicInst *DI) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 197aae6e03cb1..8729ed3890131 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4922,11 +4922,36 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
     return InstructionCost::getInvalid();
 
-  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  unsigned NumLoadStores = 1;
+  InstructionCost ShuffleCost = 0;
+  bool isInterleaveWithShuffle = false;
+  unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
+
+  auto *SubVecTy =
+      VectorType::get(VecVTy->getElementType(),
+                      VecVTy->getElementCount().divideCoefficientBy(Factor));
+
+  if (TLI->isProfitableToInterleaveWithGatherScatter() &&
+      Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
+      Factor > MaxSupportedFactor) {
+    isInterleaveWithShuffle = true;
+    SmallVector<int, 16> Mask;
+    // preparing interleave Mask.
+    for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
+         i++) {
+      for (unsigned j = 0; j < 2; j++)
+        Mask.push_back(j * Factor + i);
+    }
+
+    NumLoadStores = Factor / MaxSupportedFactor;
+    ShuffleCost =
+        (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
+                                 Mask, CostKind, 0, SubVecTy));
+  }
+
+  if (!UseMaskForGaps &&
+      (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
-    auto *SubVecTy =
-        VectorType::get(VecVTy->getElementType(),
-                        VecVTy->getElementCount().divideCoefficientBy(Factor));
 
     // ldN/stN only support legal vector types of size 64 or 128 in bits.
     // Accesses having vector types that are a multiple of 128 bits can be
@@ -4934,7 +4959,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
     bool UseScalable;
     if (MinElts % Factor == 0 &&
         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
-      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
+      return (Factor *
+              TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
+              NumLoadStores) +
+             ShuffleCost;
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 3685e9cf85bd6..b2635d3d9f1a5 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -730,6 +730,111 @@ entry:
   ret void
 }
 
+define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
+                                     <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
+; CHECK-LABEL: store_factor8:
+; CHECK:       .Lfunc_begin17:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK:  zip1	[[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
+; CHECK-NEXT:  zip2	[[V5:.*s]], [[I1]], [[I5]]
+; CHECK-NEXT:  zip1	[[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
+; CHECK-NEXT:  zip2 [[V6:.*s]], [[I2]], [[I6]]
+; CHECK-NEXT:  zip1	[[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
+; CHECK-NEXT:  zip2	[[V7:.*s]], [[I3]], [[I7]]
+; CHECK-NEXT:  zip1	[[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
+; CHECK-NEXT:  zip2	[[V8:.*s]], [[I4]], [[I8]]
+; CHECK-NEXT:  st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
+; CHECK-NEXT:  st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
+; CHECK-NEXT:  ret
+
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  store <32 x i32> %interleaved.vec, ptr %ptr, align 4
+  ret void
+}
+
+define void @store_factor16(ptr %ptr, <4 x i32> %a0,  <4 x i32> %a1,  <4 x i32> %a2,  <4 x i32> %a3,
+                                      <4 x i32> %a4,  <4 x i32> %a5,  <4 x i32> %a6,  <4 x i32> %a7,
+                                      <4 x i32> %a8,  <4 x i32> %a9,  <4 x i32> %a10, <4 x i32> %a11,
+                                      <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
+; CHECK-LABEL: store_factor16:
+; CHECK:       .Lfunc_begin18:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK:      	zip1	[[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
+; CHECK-NEXT:  	zip1	[[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
+; CHECK-NEXT:  	zip1	[[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
+; CHECK-NEXT:  	zip1	[[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
+; CHECK-NEXT:  	zip1	[[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
+; CHECK-NEXT:  	zip2	[[V09:.*s]], [[I01]], [[I09]]
+; CHECK-NEXT:  	zip2	[[V13:.*s]], [[I05]], [[I13]]
+; CHECK-NEXT:  	zip1	[[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
+; CHECK-NEXT:  	zip1	[[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
+; CHECK-NEXT:  	zip1	[[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
+; CHECK-NEXT:  	zip2	[[V10:.*s]], [[I02]], [[I10]]
+; CHECK-NEXT:  	zip2	[[V14:.*s]], [[I06]], [[I14]]
+; CHECK-NEXT:  	zip2	[[V11:.*s]], [[I03]], [[I11]]
+; CHECK-NEXT:  	zip1	[[V17:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT:  	zip2	[[V15:.*s]], [[I07]], [[I15]]
+; CHECK-NEXT:  	zip2	[[V21:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT:  	zip1	[[V18:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT:  	zip2	[[V12:.*s]], [[I04]], [[I12]]
+; CHECK-NEXT:  	zip2	[[V16:.*s]], [[I08]], [[I16]]
+; CHECK-NEXT:  	zip1	[[V19:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT:  	zip2	[[V22:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT:  	zip1	[[V25:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT:  	zip1	[[V20:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT:  	zip2	[[V23:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT:  	zip1	[[V26:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT:  	zip2	[[V29:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT:  	zip2	[[V24:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT:  	zip1	[[V27:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT:  	zip2	[[V30:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT:  	zip1	[[V28:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT:  	zip2	[[V31:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT:  	zip2	[[V32:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT:  	st4	{ [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
+; CHECK-NEXT:  	ldp	d9, d8, [sp, #48]               // 16-byte Folded Reload
+; CHECK-NEXT:  	ldp	d11, d10, [sp, #32]             // 16-byte Folded Reload
+; CHECK-NEXT:  	st4	{ [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
+; CHECK-NEXT:  	add	x8, x0, #128
+; CHECK-NEXT:  	ldp	d13, d12, [sp, #16]             // 16-byte Folded Reload
+; CHECK-NEXT:  	st4	{ [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
+; CHECK-NEXT:  	add	x8, x0, #192
+; CHECK-NEXT:  	st4	{ [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
+; CHECK-NEXT:  	ldp	d15, d14, [sp], #64             // 16-byte Folded Reload
+; CHECK-NEXT:  	ret
+
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+
+  %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32>  <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+  store <64 x i32> %interleaved.vec, ptr %ptr, align 4
+  ret void
+}
+
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll
new file mode 100644
index 0000000000000..bd5f4e2a3279b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses=true -max-interleave-group-factor=16  -S < %s | FileCheck %s
+
+define dso_local void @_Z6unpackPhS_(ptr noalias noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out) {
+; CHECK-LABEL: define dso_local void @_Z6unpackPhS_(
+; CHECK-SAME: ptr noalias noundef readonly captures(none) [[IN:%.*]], ptr noalias noundef writeonly captures(none) [[OUT:%.*]]) {
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[IN]], i64 [[OFFSET_IDX2]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP3]], align 1, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i8> [[STRIDED_VEC4]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> zeroinitializer, <4 x i8> [[STRIDED_VEC6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC5]], <4 x i8> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC4]], <4 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC]], <4 x i8> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x i8> [[TMP19]], <16 x i8> [[TMP20]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP21]], <16 x i8> [[TMP22]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> [[TMP24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> poison, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    store <64 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %vector.body, !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.033 = phi i32 [ 0, %entry ], [ %inc17, %for.body ]
+  %out.addr.032 = phi ptr [ %out, %entry ], [ %add.ptr, %for.body ]
+  %in.addr.031 = phi ptr [ %in, %entry ], [ %add.ptr15, %for.body ]
+  store i8 0, ptr %out.addr.032, align 1
+  %arrayidx10 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 3
+  %0 = load i8, ptr %arrayidx10, align 1
+  %arrayidx14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 1
+  store i8 %0, ptr %arrayidx14, align 1
+  %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 2
+  %1 = load i8, ptr %arrayidx10.1, align 1
+  %arrayidx14.1 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 2
+  store i8 %1, ptr %arrayidx14.1, align 1
+  %add.2 = add i8 %0, %1
+  %arrayidx14.2 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 3
+  store i8 %add.2, ptr %arrayidx14.2, align 1
+  %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 1
+  %2 = load i8, ptr %arrayidx10.3, align 1
+  %arrayidx14.3 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 4
+  store i8 %2, ptr %arrayidx14.3, align 1
+  %add.4 = add i8 %0, %2
+  %arrayidx14.4 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 5
+  store i8 %add.4, ptr %arrayidx14.4, align 1
+  %add.5 = add i8 %1, %2
+  %arrayidx14.5 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 6
+  store i8 %add.5, ptr %arrayidx14.5, align 1
+  %add.6 = add i8 %0, %add.5
+  %arrayidx14.6 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 7
+  store i8 %add.6, ptr %arrayidx14.6, align 1
+  %3 = load i8, ptr %in.addr.031, align 1
+  %arrayidx14.7 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 8
+  store i8 %3, ptr %arrayidx14.7, align 1
+  %add.8 = add i8 %0, %3
+  %arrayidx14.8 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 9
+  store i8 %add.8, ptr %arrayidx14.8, align 1
+  %add.9 = add i8 %1, %3
+  %arrayidx14.9 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 10
+  store i8 %add.9, ptr %arrayidx14.9, align 1
+  %add.10 = add i8 %0, %add.9
+  %arrayidx14.10 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 11
+  store i8 %add.10, ptr %arrayidx14.10, align 1
+  %add.11 = add i8 %2, %3
+  %arrayidx14.11 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 12
+  store i8 %add.11, ptr %arrayidx14.11, align 1
+  %add.12 = add i8 %0, %add.11
+  %arrayidx14.12 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 13
+  store i8 %add.12, ptr %arrayidx14.12, align 1
+  %add.13 = add i8 %1, %add.11
+  %arrayidx14.13 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 14
+  store i8 %add.13, ptr %arrayidx14.13, align 1
+  %add.14 = add i8 %0, %add.13
+  %arrayidx14.14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 15
+  store i8 %add.14, ptr %arrayidx14.14, align 1
+  %add.ptr = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 16
+  %add.ptr15 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 4
+  %inc17 = add nuw nsw i32 %i.033, 1
+  %exitcond.not = icmp eq i32 %inc17, 32
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.mustprogress"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index 68cfc659e1e94..cdddcc9fc4226 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
-; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+; RUN: opt -p loop-vectorize -max-interleave-group-factor=4 -S %s | FileCheck %s
 
 target triple = "arm64-apple-macosx15.0.0"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index f2ae327778f4a..54b7f2afe1ed0 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -925,20 +925,20 @@ define void @same_op8_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef
 ; CHECK-SAME: ptr noalias noundef captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], ptr noundef readonly captures(none) [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_VEC19:%.*]] = load <16 x float>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC19]], [[TMP4]]
-; CHECK-NEXT:    store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[WIDE_VEC19:%.*]] = load <32 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <32 x float> [[WIDE_VEC]], [[TMP1]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <32 x float> [[WIDE_VEC19]], [[TMP4]]
+; CHECK-NEXT:    store <32 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 144
 ; CHECK-NEXT:    br i1 [[TMP25]], label %[[FOR_END11:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[FOR_END11]]:

From 1fc5c02aa56ad4cef1391863dfc0922ef7110569 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 5 Nov 2025 12:13:48 -0800
Subject: [PATCH 58/61] [LVer][profcheck] explicitly set unknown branch weights
 for the versioned/unversioned selector (#164507)

We don't have sufficient information to know when the versioned (or unversioned) loop variant will be taken, so we mark the branch as having "unknown" probabilities.

Issue #147390
---
 llvm/lib/Transforms/Utils/LoopVersioning.cpp             | 9 +++++++--
 .../Transforms/LoopDistribute/basic-with-memchecks.ll    | 5 +++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..9c8b6ef83e56d 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,12 @@ void LoopVersioning::versionLoop(
   // Insert the conditional branch based on the result of the memchecks.
   Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
   Builder.SetInsertPoint(OrigTerm);
-  Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
-                       VersionedLoop->getLoopPreheader());
+  auto *BI =
+      Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+                           VersionedLoop->getLoopPreheader());
+  // We don't know what the probability of executing the versioned vs the
+  // unversioned variants is.
+  setExplicitlyUnknownBranchWeightsIfProfiled(*BI, DEBUG_TYPE);
   OrigTerm->eraseFromParent();
 
   // The loops merge in the original exit block.  This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
 @E = common global ptr null, align 8
 
 ; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
 entry:
   %a = load ptr, ptr @A, align 8
   %b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
 ; CHECK:     = icmp
 
 ; CHECK-NOT: = icmp
-; CHECK:     br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1
+; CHECK:     br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
 
 ; The non-distributed loop that the memchecks fall back on.
 
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
 
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}

From 163933e9e7099f352ff8df1973f9a9c3d7def6c5 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 5 Nov 2025 20:17:29 +0000
Subject: [PATCH 59/61] [gn build] Port 0469ff0a212d

---
 llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
index 43916cef756ff..918132b38b6ed 100644
--- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
@@ -10,6 +10,7 @@ source_set("Basic") {
     "DirectiveEmitter.cpp",
     "IntrinsicEmitter.cpp",
     "RISCVTargetDefEmitter.cpp",
+    "RuntimeLibcalls.cpp",
     "RuntimeLibcallsEmitter.cpp",
     "SDNodeProperties.cpp",
     "TableGen.cpp",

From 5f1b9023a8093fd8beb931a74d28753fbda88fdf Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks@fb.com>
Date: Wed, 5 Nov 2025 12:36:57 -0800
Subject: [PATCH 60/61] [BOLT][AArch64] Fix printing of relocation types
 (#166621)

Enumeration of relocation types is not always sequential, e.g. on
AArch64 the first real relocation type is 0x101. As such, the existing
code in `Relocation::print()` was crashing while printing AArch64
relocations. Fix it by using `llvm::object::getELFRelocationTypeName()`.
---
 bolt/lib/Core/Relocation.cpp              | 38 ++++-------------------
 bolt/test/AArch64/relocation-type-print.s | 24 ++++++++++++++
 2 files changed, 30 insertions(+), 32 deletions(-)
 create mode 100644 bolt/test/AArch64/relocation-type-print.s

diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp
index 4b827b647b06c..f872db2cae0ce 100644
--- a/bolt/lib/Core/Relocation.cpp
+++ b/bolt/lib/Core/Relocation.cpp
@@ -1018,41 +1018,15 @@ void Relocation::print(raw_ostream &OS) const {
   default:
     OS << "RType:" << Twine::utohexstr(Type);
     break;
-
-  case Triple::aarch64: {
-    static const char *const AArch64RelocNames[] = {
-#define ELF_RELOC(name, value) #name,
-#include "llvm/BinaryFormat/ELFRelocs/AArch64.def"
-#undef ELF_RELOC
-    };
-    assert(Type < ArrayRef(AArch64RelocNames).size());
-    OS << AArch64RelocNames[Type];
-  } break;
-
+  case Triple::aarch64:
+    OS << object::getELFRelocationTypeName(ELF::EM_AARCH64, Type);
+    break;
   case Triple::riscv64:
-    // RISC-V relocations are not sequentially numbered so we cannot use an
-    // array
-    switch (Type) {
-    default:
-      llvm_unreachable("illegal RISC-V relocation");
-#define ELF_RELOC(name, value)                                                 \
-  case value:                                                                  \
-    OS << #name;                                                               \
+    OS << object::getELFRelocationTypeName(ELF::EM_RISCV, Type);
     break;
-#include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
-#undef ELF_RELOC
-    }
+  case Triple::x86_64:
+    OS << object::getELFRelocationTypeName(ELF::EM_X86_64, Type);
     break;
-
-  case Triple::x86_64: {
-    static const char *const X86RelocNames[] = {
-#define ELF_RELOC(name, value) #name,
-#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
-#undef ELF_RELOC
-    };
-    assert(Type < ArrayRef(X86RelocNames).size());
-    OS << X86RelocNames[Type];
-  } break;
   }
   OS << ", 0x" << Twine::utohexstr(Offset);
   if (Symbol) {
diff --git a/bolt/test/AArch64/relocation-type-print.s b/bolt/test/AArch64/relocation-type-print.s
new file mode 100644
index 0000000000000..111cbbb94bc54
--- /dev/null
+++ b/bolt/test/AArch64/relocation-type-print.s
@@ -0,0 +1,24 @@
+## Verify that llvm-bolt correctly prints relocation types.
+
+# REQUIRES: system-linux
+
+# RUN: %clang %cflags -nostartfiles %s -o %t.exe -Wl,-q,--no-relax
+# RUN: llvm-bolt %t.exe --print-cfg --print-relocations -o %t.bolt \
+# RUN:   | FileCheck %s
+
+  .section .text
+  .align 4
+  .globl _start
+  .type _start, %function
+_start:
+
+  adrp x0, _start
+# CHECK: adrp
+# CHECK-SAME: R_AARCH64_ADR_PREL_PG_HI21
+
+  add x0, x0, :lo12:_start
+# CHECK-NEXT: add
+# CHECK-SAME: R_AARCH64_ADD_ABS_LO12_NC
+
+  ret
+  .size _start, .-_start

From 54190970cf275fd1d8a99b7c84a6a106fd543c3d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 5 Nov 2025 20:57:51 +0000
Subject: [PATCH 61/61] [LV] Add tests for narrowing interleave groups with
 casts.

Add additional tests for narrowing interleave groups with casts.
---
 ...to-widen-memory-with-wide-ops-and-casts.ll | 694 ++++++++++++++++++
 1 file changed, 694 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll
new file mode 100644
index 0000000000000..bba7d058d6637
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll
@@ -0,0 +1,694 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF4 %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx"
+
+define void @test_2xi64_matching_zext_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_matching_zext_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_matching_zext_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = zext i32 %l.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = zext i32 %l.0 to i64
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_matching_sext_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_matching_sext_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_matching_sext_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = sext i32 %l.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = sext i32 %l.0 to i64
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_mismatching_cast_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_mismatching_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP4:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_mismatching_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP4:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = zext i32 %l.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = sext i32 %l.0 to i64
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_matching_cast_add_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_matching_cast_add_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 2)
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_matching_cast_add_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP2]], splat (i64 2)
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = zext i32 %l.0 to i64
+  %add.0 = add i64 %ext.0, 2
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %add.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = zext i32 %l.0 to i64
+  %add.1 = add i64 %ext.1, 2
+  store i64 %add.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_mismatching_cast_add_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_mismatching_cast_add_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 2)
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP5]], splat (i64 2)
+; VF2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_mismatching_cast_add_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP2]], splat (i64 2)
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP5]], splat (i64 2)
+; VF4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = sext i32 %l.0 to i64
+  %add.0 = add i64 %ext.0, 2
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %add.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = zext i32 %l.0 to i64
+  %add.1 = add i64 %ext.1, 2
+  store i64 %add.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_add_cast_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_add_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_add_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %add.0 = add i32 %l.0, 2
+  %ext.0 = zext i32 %add.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %add.1 = add i32 %l.0, 2
+  %ext.1 = zext i32 %add.1 to i64
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_mismatching_add_cast_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_mismatching_add_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = sub <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP6:%.*]] = zext <2 x i32> [[TMP5]] to <2 x i64>
+; VF2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_mismatching_add_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP6:%.*]] = zext <4 x i32> [[TMP5]] to <4 x i64>
+; VF4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %add.0 = add i32 %l.0, 2
+  %ext.0 = zext i32 %add.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %add.1 = sub i32 %l.0, 2
+  %ext.1 = zext i32 %add.1 to i64
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_add_mismatching_cast_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_add_mismatching_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_add_mismatching_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %add.0 = add i32 %l.0, 2
+  %ext.0 = zext i32 %add.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %add.1 = add i32 %l.0, 2
+  %ext.1 = sext i32 %add.1 to i64
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_sub_mismatching_ops_cast_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_sub_mismatching_ops_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = sub <2 x i32> splat (i32 2), [[WIDE_LOAD]]
+; VF2-NEXT:    [[TMP6:%.*]] = zext <2 x i32> [[TMP5]] to <2 x i64>
+; VF2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_sub_mismatching_ops_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = sub <4 x i32> splat (i32 2), [[WIDE_LOAD]]
+; VF4-NEXT:    [[TMP6:%.*]] = zext <4 x i32> [[TMP5]] to <4 x i64>
+; VF4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %add.0 = sub i32 %l.0, 2
+  %ext.0 = zext i32 %add.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %add.1 = sub i32 2, %l.0
+  %ext.1 = zext i32 %add.1 to i64
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}